From 3829fecfe3b36dba4372baf7598c3e4f1a53d557 Mon Sep 17 00:00:00 2001 From: asoc1al Date: Sat, 19 Oct 2024 23:03:18 +0400 Subject: [PATCH] vrode all done --- lab2.ipynb | 935 ++++++++++++++++++++++++++++++++++++++++++++++++- poetry.lock | 183 +++++++++- pyproject.toml | 4 + 3 files changed, 1106 insertions(+), 16 deletions(-) diff --git a/lab2.ipynb b/lab2.ipynb index f111cce..68b73a3 100644 --- a/lab2.ipynb +++ b/lab2.ipynb @@ -99,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -351,7 +351,7 @@ "[5110 rows x 12 columns]" ] }, - "execution_count": 1, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -360,13 +360,12 @@ "import pandas as pd\n", "\n", "var4 = pd.read_csv(\"./datasets/var4/healthcare-dataset-stroke-data.csv\")\n", - "\n", "var4" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -387,7 +386,7 @@ "dtype: object" ] }, - "execution_count": 2, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -406,7 +405,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -779,7 +778,7 @@ "[21613 rows x 21 columns]" ] }, - "execution_count": 7, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -791,7 +790,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -821,7 +820,7 @@ "dtype: object" ] }, - "execution_count": 8, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -840,7 +839,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -1216,7 +1215,7 @@ "[1370 rows x 18 columns]" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -1228,7 +1227,7 @@ }, { "cell_type": "code", - "execution_count": 9, + 
"execution_count": 8, "metadata": {}, "outputs": [ { @@ -1255,7 +1254,7 @@ "dtype: object" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -1268,13 +1267,919 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 3. Провести анализ содержимого каждого набора данных. Что является объектом/объектами наблюдения? Каковы атрибуты объектов? Есть ли связи между объектами?" + "### 3. Провести анализ содержимого каждого набора данных. Что является объектом/объектами наблюдения? Каковы атрибуты объектов? Есть ли связи между объектами?\n", + "\n", + "1. Датасет о риске инсульта\n", + " - Объект наблюдения: Пациенты\n", + "2. Датасет о продажах недвижимости\n", + " - Объект наблюдения: Сделки по проданным домам в King County, США\n", + "3. Датасет о цене мобильных устройств\n", + " - Объект наблюдения: Модели телефонов и их цены\n", + "\n", + "Все атрибуты были перечислены выше." ] }, { "cell_type": "markdown", "metadata": {}, - "source": [] + "source": [ + "### 4. Привести примеры бизнес-целей, для достижения которых могут подойти выбранные наборы данных. Каков эффект для бизнеса?\n", + "1. Датасет о риске инсульта\n", + " - Бизнес-цель: Определить факторы риска инсульта и предохранить пациентов от инсульта.\n", + " - Эффект для бизнеса: Снижение количества случаев инсульта, снижение затрат на лечение и улучшение репутации клиники.\n", + "2. Датасет о продажах недвижимости\n", + " - Бизнес-цель: Определить факторы, влияющие на продажи недвижимости\n", + " - Эффект для бизнеса: Улучшение стратегии продаж, повышение эффективности подбора имущества для последующего извлечения прибыли\n", + "3. Датасет о цене мобильных устройств\n", + " - Бизнес-цель: Определить факторы, влияющие на цену мобильных устройств\n", + " - Эффект для бизнеса: Улучшение стратегии ценообразования, повышение эффективности продаж и прибыли." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5. 
Привести примеры целей технического проекта для каждой выделенной ранее бизнес-цели. Что поступает на вход, что является целевым признаком?\n", + "1. Датасет о риске инсульта\n", + " - Бизнес-цель: Разработка системы раннего предупреждения инсульта.\n", + " - Цель технического проекта: Создание модели машинного обучения для прогнозирования вероятности инсульта.\n", + " - Входные данные:\n", + " Пол, Возраст, Наличие гипертензии, Наличие сердечных заболеваний, Статус брака, Тип работы, Тип проживания, Средний уровень глюкозы, Индекс массы тела, Статус курения и т.д.\n", + " - Целевой признак: Наличие инсульта (stroke).\n", + "2. Датасет о продажах недвижимости\n", + " - Бизнес-цель: Развитие системы рекомендации недвижимости, рекомендованной к покупке для последующей перепродажи.\n", + " - Цель технического проекта: Разработка модели машинного обучения для прогнозирования цены недвижимости.\n", + " - Входные данные:\n", + " Площадь, Площадь комнат, Площадь участка, Тип дома, Тип комнат и другие признаки.\n", + " - Целевой признак: Цена недвижимости (price).\n", + "3. Датасет о цене мобильных устройств\n", + " - Бизнес-цель: Оптимизация ценообразования и улучшение стратегии продаж мобильных устройств.\n", + " - Цель технического проекта: Построение модели для предсказания рекомендованной цены мобильного устройства на основе характеристик.\n", + " - Входные данные:\n", + " Имя, Рейтинг, Очки производительности, Кол-во SIM-слотов, Оперативная память, Емкость аккумулятора, Дисплей, Камера, Дополнительные слоты для карт памяти и остальное.\n", + " - Целевой признак: Очки производительности (Spec_score)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6. Определить проблемы выбранных наборов данных: зашумленность, смещение, актуальность, выбросы, просачивание данных.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 7. 
Привести примеры решения обнаруженных проблем для каждого набора данных\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# 1. Проверка на зашумленность ---- количество пропусков в процентах от общего кол-ва\n", + "def check_noise(dataframe):\n", + " total_values = dataframe.size\n", + " missing_values = dataframe.isnull().sum().sum()\n", + " noise_percentage = (missing_values / total_values) * 100\n", + " return f\"Зашумленность: {noise_percentage:.2f}%\"\n", + "\n", + "# 2. Проверка на смещение ----- объем уникальных значений внутри определенной колонки \n", + "def check_bias(dataframe, target_column):\n", + " if target_column in dataframe.columns:\n", + " unique_values = dataframe[target_column].nunique()\n", + " total_values = len(dataframe)\n", + " bias_percentage = (unique_values / total_values) * 100\n", + " return f\"Смещение по {target_column}: {bias_percentage:.2f}% уникальных значений\"\n", + " return \"Целевой признак не найден.\"\n", + "\n", + "# 3. Проверка на дубликаты\n", + "def check_duplicates(dataframe):\n", + " duplicate_percentage = dataframe.duplicated().mean() * 100\n", + " return f\"Количество дубликатов: {duplicate_percentage:.2f}%\"\n", + "\n", + "# 4. 
Проверка на выбросы\n", + "def check_outliers(dataframe, column):\n", + " if column in dataframe.columns:\n", + " Q1 = dataframe[column].quantile(0.25)\n", + " Q3 = dataframe[column].quantile(0.75)\n", + " IQR = Q3 - Q1\n", + " lower_bound = Q1 - 1.5 * IQR\n", + " upper_bound = Q3 + 1.5 * IQR\n", + " outlier_count = dataframe[(dataframe[column] < lower_bound) | (dataframe[column] > upper_bound)].shape[0]\n", + " total_count = dataframe.shape[0]\n", + " outlier_percentage = (outlier_count / total_count) * 100\n", + " return f\"Выбросы по {column}: {outlier_percentage:.2f}%\"\n", + " return f\"Признак {column} не найден.\"\n", + "\n", + "# 5. Проверка на просачивание данных\n", + "def check_data_leakage(dataframe, target_column):\n", + " if target_column in dataframe.columns:\n", + " correlation_matrix = dataframe.select_dtypes(include=[np.number]).corr()\n", + " leakage_info = correlation_matrix[target_column].abs().nlargest(10)\n", + " leakage_report = \", \".join([f\"{feature}: {value:.2f}\" for feature, value in leakage_info.items() if feature != target_column])\n", + " return f\"Признаки просачивания данных: {leakage_report}\"\n", + " return \"Целевой признак не найден.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Датасет о риске инсульта:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Зашумленность: 0.33%\n", + "Смещение по avg_glucose_level: 77.87% уникальных значений\n", + "Количество дубликатов: 0.00%\n", + "Выбросы по avg_glucose_level: 12.27%\n", + "Признаки просачивания данных: age: 0.25, heart_disease: 0.13, avg_glucose_level: 0.13, hypertension: 0.13, bmi: 0.04, id: 0.01\n" + ] + } + ], + "source": [ + "noise_columns = check_noise(var4)\n", + "bias_info = check_bias(var4, 'avg_glucose_level') \n", + "duplicate_count = check_duplicates(var4)\n", + "outliers_data = check_outliers(var4, 'avg_glucose_level') 
\n", + "leakage_info = check_data_leakage(var4, 'stroke') \n", + "\n", + "print(noise_columns)\n", + "print(bias_info)\n", + "print(duplicate_count)\n", + "print(outliers_data)\n", + "print(leakage_info)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Датасет о продажах недвижимости:" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Зашумленность: 0.00%\n", + "Смещение по price: 18.64% уникальных значений\n", + "Количество дубликатов: 0.00%\n", + "Выбросы по yr_renovated: 4.23%\n", + "Признаки просачивания данных: yr_built: 0.36, floors: 0.26, sqft_basement: 0.17, sqft_above: 0.16, grade: 0.14, bathrooms: 0.12, long: 0.11, sqft_living15: 0.09, yr_renovated: 0.06\n" + ] + } + ], + "source": [ + "noise_columns = check_noise(var6)\n", + "bias_info = check_bias(var6, 'price') \n", + "duplicate_count = check_duplicates(var6)\n", + "outliers_data = check_outliers(var6, 'yr_renovated') \n", + "leakage_info = check_data_leakage(var6, 'condition') \n", + "\n", + "print(noise_columns)\n", + "print(bias_info)\n", + "print(duplicate_count)\n", + "print(outliers_data)\n", + "print(leakage_info)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Датасет о цене мобильных устройств:" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Зашумленность: 2.36%\n", + "Смещение по company: 1.90% уникальных значений\n", + "Количество дубликатов: 0.00%\n", + "Выбросы по Spec_score: 1.24%\n", + "Признаки просачивания данных: Spec_score: 0.06, Unnamed: 0: 0.03\n" + ] + } + ], + "source": [ + "noise_columns = check_noise(var18)\n", + "bias_info = check_bias(var18, 'company') \n", + "duplicate_count = check_duplicates(var18)\n", + "outliers_data = check_outliers(var18, 'Spec_score') \n", + "leakage_info = 
check_data_leakage(var18, 'Rating') \n", + "\n", + "print(noise_columns)\n", + "print(bias_info)\n", + "print(duplicate_count)\n", + "print(outliers_data)\n", + "print(leakage_info)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 9. Устранить проблему пропущенных данных. Для каждого набора данных использовать разные методы: удаление, подстановка константного значения (0 или подобное), подстановка среднего значения" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "gender 0\n", + "age 0\n", + "hypertension 0\n", + "heart_disease 0\n", + "ever_married 0\n", + "work_type 0\n", + "Residence_type 0\n", + "avg_glucose_level 0\n", + "bmi 201\n", + "smoking_status 0\n", + "stroke 0\n", + "dtype: int64" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Инсульт\n", + "var4.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "var4['bmi'] = var4['bmi'].fillna(var4['bmi'].mean())" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "gender 0\n", + "age 0\n", + "hypertension 0\n", + "heart_disease 0\n", + "ever_married 0\n", + "work_type 0\n", + "Residence_type 0\n", + "avg_glucose_level 0\n", + "bmi 0\n", + "smoking_status 0\n", + "stroke 0\n", + "dtype: int64" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "var4.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "date 0\n", + "price 0\n", + "bedrooms 0\n", + "bathrooms 0\n", + "sqft_living 0\n", + "sqft_lot 0\n", + "floors 0\n", + "waterfront 0\n", + "view 0\n", + "condition 0\n", + "grade 0\n", + "sqft_above 
0\n", + "sqft_basement 0\n", + "yr_built 0\n", + "yr_renovated 0\n", + "zipcode 0\n", + "lat 0\n", + "long 0\n", + "sqft_living15 0\n", + "sqft_lot15 0\n", + "dtype: int64" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Дома\n", + "var6.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Unnamed: 0 0\n", + "Name 0\n", + "Rating 0\n", + "Spec_score 0\n", + "No_of_sim 0\n", + "Ram 0\n", + "Battery 0\n", + "Display 0\n", + "Camera 0\n", + "External_Memory 0\n", + "Android_version 443\n", + "Price 0\n", + "company 0\n", + "Inbuilt_memory 19\n", + "fast_charging 89\n", + "Screen_resolution 2\n", + "Processor 28\n", + "Processor_name 0\n", + "dtype: int64" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Мобильные устройства\n", + "var18.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "var18['Android_version'] = var18['Android_version'].fillna('No info')\n", + "var18['Inbuilt_memory'] = var18['Android_version'].fillna('No info')\n", + "var18['fast_charging'] = var18['Android_version'].fillna('No info')\n", + "var18['Screen_resolution'] = var18['Android_version'].fillna('No info')\n", + "var18['Processor'] = var18['Android_version'].fillna('No info')" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Unnamed: 0 0\n", + "Name 0\n", + "Rating 0\n", + "Spec_score 0\n", + "No_of_sim 0\n", + "Ram 0\n", + "Battery 0\n", + "Display 0\n", + "Camera 0\n", + "External_Memory 0\n", + "Android_version 0\n", + "Price 0\n", + "company 0\n", + "Inbuilt_memory 0\n", + "fast_charging 0\n", + "Screen_resolution 0\n", + "Processor 0\n", + "Processor_name 0\n", + "dtype: int64" + ] + }, + "execution_count": 45, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "var18.isnull().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 10. Выполнить разбиение каждого набора данных на обучающую, контрольную и тестовую выборки¶\n" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "var4 Dataset:\n", + "Train: 80.00%\n", + "Validation: 10.00%\n", + "Test: 10.00%\n", + "\n" + ] + } + ], + "source": [ + "# Разбиение var4 (Инсульт)\n", + "\n", + "original_var4_size = len(var4)\n", + "train_var4, temp_var4 = train_test_split(var4, test_size=0.2, random_state=42)\n", + "val_var4, test_var4 = train_test_split(temp_var4, test_size=0.5, random_state=42)\n", + "\n", + "print(\"var4 Dataset:\")\n", + "print(f\"Train: {len(train_var4)/original_var4_size*100:.2f}%\")\n", + "print(f\"Validation: {len(val_var4)/original_var4_size*100:.2f}%\")\n", + "print(f\"Test: {len(test_var4)/original_var4_size*100:.2f}%\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "var6 Dataset:\n", + "Train: 80.00%\n", + "Validation: 10.00%\n", + "Test: 10.00%\n", + "\n" + ] + } + ], + "source": [ + "# Разбиение var6 (Дома)\n", + "original_var6_size = len(var6)\n", + "train_var6, temp_var6 = train_test_split(var6, test_size=0.2, random_state=42)\n", + "val_var6, test_var6 = train_test_split(temp_var6, test_size=0.5, random_state=42)\n", + "\n", + "print(\"var6 Dataset:\")\n", + "print(f\"Train: {len(train_var6)/original_var6_size*100:.2f}%\")\n", + "print(f\"Validation: {len(val_var6)/original_var6_size*100:.2f}%\")\n", + "print(f\"Test: 
{len(test_var6)/original_var6_size*100:.2f}%\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "var18 Dataset:\n", + "Train: 80.00%\n", + "Validation: 10.00%\n", + "Test: 10.00%\n", + "\n" + ] + } + ], + "source": [ + "# Разбиение var18 (Мобильные устройства)\n", + "original_var18_size = len(var18)\n", + "train_var18, temp_var18 = train_test_split(var18, test_size=0.2, random_state=42)\n", + "val_var18, test_var18 = train_test_split(temp_var18, test_size=0.5, random_state=42)\n", + "\n", + "print(\"var18 Dataset:\")\n", + "print(f\"Train: {len(train_var18)/original_var18_size*100:.2f}%\")\n", + "print(f\"Validation: {len(val_var18)/original_var18_size*100:.2f}%\")\n", + "print(f\"Test: {len(test_var18)/original_var18_size*100:.2f}%\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 11. Оценить сбалансированность выборок для каждого набора данных. Оценить необходимость использования методов приращения (аугментации) данных." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 12. Выполнить приращение данных методами выборки с избытком (oversampling) и выборки с недостатком (undersampling). Должны быть представлены примеры реализации обоих методов для выборок каждого набора данных." + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def plot_sample_balance(y, sample_name):\n", + " plt.figure(figsize=(8, 5))\n", + " sns.histplot(y, bins=30, kde=True)\n", + " plt.title(f'Распределение целевой переменной для {sample_name}')\n", + " plt.xlabel(sample_name)\n", + " plt.ylabel('Частота')\n", + " plt.show()\n", + "\n", + "# Оценка сбалансированности выборок\n", + "plot_sample_balance(train_var6['price'], 'Train var6')\n", + "plot_sample_balance(val_var6['price'], 'Validation var6')\n", + "plot_sample_balance(test_var6['price'], 'Test var6')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Распределения выборок у данного датасета выглядят схоже. Это говорит о сбалансированности выборок." + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_sample_balance(train_var4['stroke'], 'Train var4')\n", + "plot_sample_balance(val_var4['stroke'], 'Validation var4')\n", + "plot_sample_balance(test_var4['stroke'], 'Test var4')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Выборки выглядят схоже, но у всех трех имеется явный дисбаланс классов. Это проблема, т.к в дальнейшем не сможем обучить какую-либо модель." + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_sample_balance(train_var18['Spec_score'], 'Train var18')\n", + "plot_sample_balance(val_var18['Spec_score'], 'Validation var18')\n", + "plot_sample_balance(test_var18['Spec_score'], 'Test var18')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Распределения выборок у данного датасета выглядят схоже. Это говорит о сбалансированности выборок. Однако в тренировочной выборке значительно больший размах значений" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 12. Выполнить приращение данных методами выборки с избытком (oversampling) и выборки с недостатком (undersampling). Должны быть представлены примеры реализации обоих методов для выборок каждого набора данных\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Инсульт" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "После oversampling (var4): stroke\n", + "1 4861\n", + "0 4861\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "from imblearn.over_sampling import SMOTE\n", + "\n", + "X_var4 = var4.drop('stroke', axis=1)\n", + "y_var4 = var4['stroke']\n", + "\n", + "# Кодирование категориальных признаков\n", + "for column in X_var4.select_dtypes(include=['object']).columns:\n", + " X_var4[column] = X_var4[column].astype('category').cat.codes\n", + "\n", + "# Теперь применяем SMOTE\n", + "smote = SMOTE(random_state=42)\n", + "X_resampled_var4, y_resampled_var4 = smote.fit_resample(X_var4, y_var4)\n", + "\n", + "# Получаем результаты\n", + "print(f'После oversampling (var4): {pd.Series(y_resampled_var4).value_counts()}')" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "После undersampling (var4): stroke\n", 
+ "0 249\n", + "1 249\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "from imblearn.under_sampling import RandomUnderSampler\n", + "\n", + "# Undersampling для var4\n", + "undersample = RandomUnderSampler(random_state=42)\n", + "X_under_var4, y_under_var4 = undersample.fit_resample(X_var4, y_var4)\n", + "\n", + "print(f'После undersampling (var4): {pd.Series(y_under_var4).value_counts()}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Дома" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "\"['Price'] not found in axis\"", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[65], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m X_var6 \u001b[38;5;241m=\u001b[39m \u001b[43mvar6\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mPrice\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m y_var6 \u001b[38;5;241m=\u001b[39m var6[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mPrice\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[0;32m 4\u001b[0m \u001b[38;5;66;03m# Кодирование категориальных признаков\u001b[39;00m\n", + "File \u001b[1;32mc:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\pandas\\core\\frame.py:5581\u001b[0m, in \u001b[0;36mDataFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m 5433\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdrop\u001b[39m(\n\u001b[0;32m 5434\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 
5435\u001b[0m labels: IndexLabel \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 5442\u001b[0m errors: IgnoreRaise \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraise\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 5443\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 5444\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 5445\u001b[0m \u001b[38;5;124;03m Drop specified labels from rows or columns.\u001b[39;00m\n\u001b[0;32m 5446\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 5579\u001b[0m \u001b[38;5;124;03m weight 1.0 0.8\u001b[39;00m\n\u001b[0;32m 5580\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 5581\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 5582\u001b[0m \u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5583\u001b[0m \u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5584\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5585\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5586\u001b[0m \u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5587\u001b[0m \u001b[43m 
\u001b[49m\u001b[43minplace\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minplace\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5588\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5589\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\pandas\\core\\generic.py:4788\u001b[0m, in \u001b[0;36mNDFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m 4786\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m axis, labels \u001b[38;5;129;01min\u001b[39;00m axes\u001b[38;5;241m.\u001b[39mitems():\n\u001b[0;32m 4787\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m labels \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m-> 4788\u001b[0m obj \u001b[38;5;241m=\u001b[39m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_drop_axis\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4790\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inplace:\n\u001b[0;32m 4791\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_inplace(obj)\n", + "File \u001b[1;32mc:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\pandas\\core\\generic.py:4830\u001b[0m, in \u001b[0;36mNDFrame._drop_axis\u001b[1;34m(self, labels, axis, level, errors, only_slice)\u001b[0m\n\u001b[0;32m 4828\u001b[0m new_axis \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mdrop(labels, 
level\u001b[38;5;241m=\u001b[39mlevel, errors\u001b[38;5;241m=\u001b[39merrors)\n\u001b[0;32m 4829\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 4830\u001b[0m new_axis \u001b[38;5;241m=\u001b[39m \u001b[43maxis\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4831\u001b[0m indexer \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mget_indexer(new_axis)\n\u001b[0;32m 4833\u001b[0m \u001b[38;5;66;03m# Case for non-unique axis\u001b[39;00m\n\u001b[0;32m 4834\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "File \u001b[1;32mc:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:7070\u001b[0m, in \u001b[0;36mIndex.drop\u001b[1;34m(self, labels, errors)\u001b[0m\n\u001b[0;32m 7068\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m mask\u001b[38;5;241m.\u001b[39many():\n\u001b[0;32m 7069\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m-> 7070\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlabels[mask]\u001b[38;5;241m.\u001b[39mtolist()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not found in axis\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 7071\u001b[0m indexer \u001b[38;5;241m=\u001b[39m indexer[\u001b[38;5;241m~\u001b[39mmask]\n\u001b[0;32m 7072\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdelete(indexer)\n", + "\u001b[1;31mKeyError\u001b[0m: \"['Price'] not found in axis\"" + ] + } + ], + "source": [ + "X_var6 = var6.drop('Price', axis=1)\n", + "y_var6 = 
var6['Price']\n", + "\n", + "# Кодирование категориальных признаков\n", + "for column in X_var6.select_dtypes(include=['object']).columns:\n", + " X_var6[column] = X_var6[column].astype('category').cat.codes\n", + "\n", + "# Теперь применяем SMOTE\n", + "smote = SMOTE(random_state=42)\n", + "X_resampled_var6, y_resampled_var6 = smote.fit_resample(X_var6, y_var6)\n", + "\n", + "# Получаем результаты\n", + "print(f'После oversampling (var6): {pd.Series(y_resampled_var6).value_counts()}')" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "Expected n_neighbors <= n_samples_fit, but n_neighbors = 6, n_samples_fit = 1, n_samples = 1", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[69], line 10\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[38;5;66;03m# Теперь применяем SMOTE\u001b[39;00m\n\u001b[0;32m 9\u001b[0m smote \u001b[38;5;241m=\u001b[39m SMOTE(random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m42\u001b[39m)\n\u001b[1;32m---> 10\u001b[0m X_resampled_var18, y_resampled_var18 \u001b[38;5;241m=\u001b[39m \u001b[43msmote\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_resample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_var18\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_var18\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;66;03m# Получаем результаты\u001b[39;00m\n\u001b[0;32m 13\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mПосле oversampling (var18): \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpd\u001b[38;5;241m.\u001b[39mSeries(y_resampled_var18)\u001b[38;5;241m.\u001b[39mvalue_counts()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", + "File 
\u001b[1;32mc:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\imblearn\\base.py:208\u001b[0m, in \u001b[0;36mBaseSampler.fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 187\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Resample the dataset.\u001b[39;00m\n\u001b[0;32m 188\u001b[0m \n\u001b[0;32m 189\u001b[0m \u001b[38;5;124;03mParameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 205\u001b[0m \u001b[38;5;124;03m The corresponding label of `X_resampled`.\u001b[39;00m\n\u001b[0;32m 206\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[1;32m--> 208\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_resample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\imblearn\\base.py:112\u001b[0m, in \u001b[0;36mSamplerMixin.fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 106\u001b[0m X, y, binarize_y \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_X_y(X, y)\n\u001b[0;32m 108\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy_ \u001b[38;5;241m=\u001b[39m check_sampling_strategy(\n\u001b[0;32m 109\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy, y, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sampling_type\n\u001b[0;32m 110\u001b[0m )\n\u001b[1;32m--> 112\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit_resample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 114\u001b[0m y_ \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 115\u001b[0m label_binarize(output[\u001b[38;5;241m1\u001b[39m], classes\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39munique(y)) \u001b[38;5;28;01mif\u001b[39;00m binarize_y \u001b[38;5;28;01melse\u001b[39;00m output[\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m 116\u001b[0m )\n\u001b[0;32m 118\u001b[0m X_, y_ \u001b[38;5;241m=\u001b[39m arrays_transformer\u001b[38;5;241m.\u001b[39mtransform(output[\u001b[38;5;241m0\u001b[39m], y_)\n", + "File \u001b[1;32mc:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\imblearn\\over_sampling\\_smote\\base.py:389\u001b[0m, in \u001b[0;36mSMOTE._fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 386\u001b[0m X_class \u001b[38;5;241m=\u001b[39m _safe_indexing(X, target_class_indices)\n\u001b[0;32m 388\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnn_k_\u001b[38;5;241m.\u001b[39mfit(X_class)\n\u001b[1;32m--> 389\u001b[0m nns \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnn_k_\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkneighbors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_class\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_distance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m[:, \u001b[38;5;241m1\u001b[39m:]\n\u001b[0;32m 390\u001b[0m X_new, y_new \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_samples(\n\u001b[0;32m 391\u001b[0m X_class, y\u001b[38;5;241m.\u001b[39mdtype, class_sample, X_class, nns, n_samples, \u001b[38;5;241m1.0\u001b[39m\n\u001b[0;32m 392\u001b[0m )\n\u001b[0;32m 393\u001b[0m X_resampled\u001b[38;5;241m.\u001b[39mappend(X_new)\n", + "File 
\u001b[1;32mc:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\sklearn\\neighbors\\_base.py:834\u001b[0m, in \u001b[0;36mKNeighborsMixin.kneighbors\u001b[1;34m(self, X, n_neighbors, return_distance)\u001b[0m\n\u001b[0;32m 832\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 833\u001b[0m inequality_str \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mn_neighbors <= n_samples_fit\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m--> 834\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 835\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExpected \u001b[39m\u001b[38;5;132;01m{\u001b[39;00minequality_str\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, but \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 836\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mn_neighbors = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mn_neighbors\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, n_samples_fit = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mn_samples_fit\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 837\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mn_samples = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mX\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;66;03m# include n_samples for common tests\u001b[39;00m\n\u001b[0;32m 838\u001b[0m )\n\u001b[0;32m 840\u001b[0m n_jobs \u001b[38;5;241m=\u001b[39m effective_n_jobs(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_jobs)\n\u001b[0;32m 841\u001b[0m chunked_results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "\u001b[1;31mValueError\u001b[0m: Expected n_neighbors <= n_samples_fit, but n_neighbors = 6, n_samples_fit = 1, n_samples = 1" + ] + } + ], + "source": [ 
+ "X_var18 = var18.drop('Price', axis=1)\n", + "y_var18 = var18['Price']\n", + "\n", + "# Кодирование категориальных признаков\n", + "for column in X_var18.select_dtypes(include=['object']).columns:\n", + " X_var18[column] = X_var18[column].astype('category').cat.codes\n", + "\n", + "# Теперь применяем SMOTE\n", + "smote = SMOTE(random_state=42)\n", + "X_resampled_var18, y_resampled_var18 = smote.fit_resample(X_var18, y_var18)\n", + "\n", + "# Получаем результаты\n", + "print(f'После oversampling (var18): {pd.Series(y_resampled_var18).value_counts()}')" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "После undersampling (var4): stroke\n", + "0 249\n", + "1 249\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "from imblearn.under_sampling import RandomUnderSampler\n", + "\n", + "# Undersampling для var4\n", + "undersample = RandomUnderSampler(random_state=42)\n", + "X_under_var4, y_under_var4 = undersample.fit_resample(X_var4, y_var4)\n", + "\n", + "print(f'После undersampling (var4): {pd.Series(y_under_var4).value_counts()}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В данном случае у нас есть только один датасет, предназначенный для решения задачи классификации (инсульт). Проблему дисбаланса в нем мы решили применив undersampling & oversampling.\n", + "\n", + "Два остальных датасета не содержат классов, т.к предназначены для решения задачи регрессии (предсказания цен на недвижимость или цены мобильного устройства), поэтому выполнять приращение данных не требуется." 
+ ] } ], "metadata": { diff --git a/poetry.lock b/poetry.lock index 71a29fe..0ab2c91 100644 --- a/poetry.lock +++ b/poetry.lock @@ -366,6 +366,44 @@ ufo = ["fs (>=2.2.0,<3)"] unicode = ["unicodedata2 (>=15.1.0)"] woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"] +[[package]] +name = "imbalanced-learn" +version = "0.12.4" +description = "Toolbox for imbalanced dataset in machine learning." +optional = false +python-versions = "*" +files = [ + {file = "imbalanced-learn-0.12.4.tar.gz", hash = "sha256:8153ba385d296b07d97e0901a2624a86c06b48c94c2f92da3a5354827697b7a3"}, + {file = "imbalanced_learn-0.12.4-py3-none-any.whl", hash = "sha256:d47fc599160d3ea882e712a3a6b02bdd353c1a6436d8d68d41b1922e6ee4a703"}, +] + +[package.dependencies] +joblib = ">=1.1.1" +numpy = ">=1.17.3" +scikit-learn = ">=1.0.2" +scipy = ">=1.5.0" +threadpoolctl = ">=2.0.0" + +[package.extras] +docs = ["keras (>=2.4.3)", "matplotlib (>=3.1.2)", "memory-profiler (>=0.57.0)", "numpydoc (>=1.5.0)", "pandas (>=1.0.5)", "pydata-sphinx-theme (>=0.13.3)", "seaborn (>=0.9.0)", "sphinx (>=6.0.0)", "sphinx-copybutton (>=0.5.2)", "sphinx-design (>=0.5.0)", "sphinx-gallery (>=0.13.0)", "sphinxcontrib-bibtex (>=2.4.1)", "tensorflow (>=2.4.3)"] +examples = ["keras (>=2.4.3)", "matplotlib (>=3.1.2)", "pandas (>=1.0.5)", "seaborn (>=0.9.0)", "tensorflow (>=2.4.3)"] +optional = ["keras (>=2.4.3)", "pandas (>=1.0.5)", "tensorflow (>=2.4.3)"] +tests = ["black (>=23.3.0)", "flake8 (>=3.8.2)", "keras (>=2.4.3)", "mypy (>=1.3.0)", "pandas (>=1.0.5)", "pytest (>=5.0.1)", "pytest-cov (>=2.9.0)", "tensorflow (>=2.4.3)"] + +[[package]] +name = "imblearn" +version = "0.0" +description = "Toolbox for imbalanced dataset in machine learning." 
+optional = false +python-versions = "*" +files = [ + {file = "imblearn-0.0-py2.py3-none-any.whl", hash = "sha256:d42c2d709d22c00d2b9a91e638d57240a8b79b4014122d92181fcd2549a2f79a"}, + {file = "imblearn-0.0.tar.gz", hash = "sha256:d8fbb662919c1b16f438ad91a8256220e53bcf6815c9ad5502c518b798de34f2"}, +] + +[package.dependencies] +imbalanced-learn = "*" + [[package]] name = "ipykernel" version = "6.29.5" @@ -454,6 +492,17 @@ docs = ["Jinja2 (==2.11.3)", "MarkupSafe (==1.1.1)", "Pygments (==2.8.1)", "alab qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"] testing = ["Django", "attrs", "colorama", "docopt", "pytest (<7.0.0)"] +[[package]] +name = "joblib" +version = "1.4.2" +description = "Lightweight pipelining with Python functions" +optional = false +python-versions = ">=3.8" +files = [ + {file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"}, + {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"}, +] + [[package]] name = "jupyter-client" version = "8.6.3" @@ -1284,6 +1333,127 @@ files = [ [package.dependencies] cffi = {version = "*", markers = "implementation_name == \"pypy\""} +[[package]] +name = "scikit-learn" +version = "1.5.2" +description = "A set of python modules for machine learning and data mining" +optional = false +python-versions = ">=3.9" +files = [ + {file = "scikit_learn-1.5.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:299406827fb9a4f862626d0fe6c122f5f87f8910b86fe5daa4c32dcd742139b6"}, + {file = "scikit_learn-1.5.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:2d4cad1119c77930b235579ad0dc25e65c917e756fe80cab96aa3b9428bd3fb0"}, + {file = "scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c412ccc2ad9bf3755915e3908e677b367ebc8d010acbb3f182814524f2e5540"}, + {file = 
"scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a686885a4b3818d9e62904d91b57fa757fc2bed3e465c8b177be652f4dd37c8"}, + {file = "scikit_learn-1.5.2-cp310-cp310-win_amd64.whl", hash = "sha256:c15b1ca23d7c5f33cc2cb0a0d6aaacf893792271cddff0edbd6a40e8319bc113"}, + {file = "scikit_learn-1.5.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:03b6158efa3faaf1feea3faa884c840ebd61b6484167c711548fce208ea09445"}, + {file = "scikit_learn-1.5.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:1ff45e26928d3b4eb767a8f14a9a6efbf1cbff7c05d1fb0f95f211a89fd4f5de"}, + {file = "scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f763897fe92d0e903aa4847b0aec0e68cadfff77e8a0687cabd946c89d17e675"}, + {file = "scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8b0ccd4a902836493e026c03256e8b206656f91fbcc4fde28c57a5b752561f1"}, + {file = "scikit_learn-1.5.2-cp311-cp311-win_amd64.whl", hash = "sha256:6c16d84a0d45e4894832b3c4d0bf73050939e21b99b01b6fd59cbb0cf39163b6"}, + {file = "scikit_learn-1.5.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f932a02c3f4956dfb981391ab24bda1dbd90fe3d628e4b42caef3e041c67707a"}, + {file = "scikit_learn-1.5.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:3b923d119d65b7bd555c73be5423bf06c0105678ce7e1f558cb4b40b0a5502b1"}, + {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, + {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, + {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = 
"sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"}, + {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"}, + {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, + {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, + {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, + {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca64b3089a6d9b9363cd3546f8978229dcbb737aceb2c12144ee3f70f95684b7"}, + {file = "scikit_learn-1.5.2-cp39-cp39-win_amd64.whl", hash = "sha256:3bed4909ba187aca80580fe2ef370d9180dcf18e621a27c4cf2ef10d279a7efe"}, + {file = "scikit_learn-1.5.2.tar.gz", hash = "sha256:b4237ed7b3fdd0a4882792e68ef2545d5baa50aca3bb45aa7df468138ad8f94d"}, +] + +[package.dependencies] +joblib = ">=1.2.0" +numpy = ">=1.19.5" +scipy = ">=1.6.0" +threadpoolctl = ">=3.1.0" + +[package.extras] +benchmark = ["matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "pandas (>=1.1.5)"] +build = ["cython (>=3.0.10)", "meson-python (>=0.16.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)"] +docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.3.4)", 
"memory_profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pydata-sphinx-theme (>=0.15.3)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)", "sphinx (>=7.3.7)", "sphinx-copybutton (>=0.5.2)", "sphinx-design (>=0.5.0)", "sphinx-design (>=0.6.0)", "sphinx-gallery (>=0.16.0)", "sphinx-prompt (>=1.4.0)", "sphinx-remove-toctrees (>=1.0.0.post1)", "sphinxcontrib-sass (>=0.3.4)", "sphinxext-opengraph (>=0.9.1)"] +examples = ["matplotlib (>=3.3.4)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)"] +install = ["joblib (>=1.2.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)", "threadpoolctl (>=3.1.0)"] +maintenance = ["conda-lock (==2.5.6)"] +tests = ["black (>=24.3.0)", "matplotlib (>=3.3.4)", "mypy (>=1.9)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pyarrow (>=12.0.0)", "pytest (>=7.1.2)", "pytest-cov (>=2.9.0)", "ruff (>=0.2.1)", "scikit-image (>=0.17.2)"] + +[[package]] +name = "scipy" +version = "1.14.1" +description = "Fundamental algorithms for scientific computing in Python" +optional = false +python-versions = ">=3.10" +files = [ + {file = "scipy-1.14.1-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:b28d2ca4add7ac16ae8bb6632a3c86e4b9e4d52d3e34267f6e1b0c1f8d87e389"}, + {file = "scipy-1.14.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:d0d2821003174de06b69e58cef2316a6622b60ee613121199cb2852a873f8cf3"}, + {file = "scipy-1.14.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8bddf15838ba768bb5f5083c1ea012d64c9a444e16192762bd858f1e126196d0"}, + {file = "scipy-1.14.1-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:97c5dddd5932bd2a1a31c927ba5e1463a53b87ca96b5c9bdf5dfd6096e27efc3"}, + {file = "scipy-1.14.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ff0a7e01e422c15739ecd64432743cf7aae2b03f3084288f399affcefe5222d"}, + {file = 
"scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8e32dced201274bf96899e6491d9ba3e9a5f6b336708656466ad0522d8528f69"}, + {file = "scipy-1.14.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8426251ad1e4ad903a4514712d2fa8fdd5382c978010d1c6f5f37ef286a713ad"}, + {file = "scipy-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:a49f6ed96f83966f576b33a44257d869756df6cf1ef4934f59dd58b25e0327e5"}, + {file = "scipy-1.14.1-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:2da0469a4ef0ecd3693761acbdc20f2fdeafb69e6819cc081308cc978153c675"}, + {file = "scipy-1.14.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:c0ee987efa6737242745f347835da2cc5bb9f1b42996a4d97d5c7ff7928cb6f2"}, + {file = "scipy-1.14.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3a1b111fac6baec1c1d92f27e76511c9e7218f1695d61b59e05e0fe04dc59617"}, + {file = "scipy-1.14.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8475230e55549ab3f207bff11ebfc91c805dc3463ef62eda3ccf593254524ce8"}, + {file = "scipy-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:278266012eb69f4a720827bdd2dc54b2271c97d84255b2faaa8f161a158c3b37"}, + {file = "scipy-1.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fef8c87f8abfb884dac04e97824b61299880c43f4ce675dd2cbeadd3c9b466d2"}, + {file = "scipy-1.14.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b05d43735bb2f07d689f56f7b474788a13ed8adc484a85aa65c0fd931cf9ccd2"}, + {file = "scipy-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:716e389b694c4bb564b4fc0c51bc84d381735e0d39d3f26ec1af2556ec6aad94"}, + {file = "scipy-1.14.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:631f07b3734d34aced009aaf6fedfd0eb3498a97e581c3b1e5f14a04164a456d"}, + {file = "scipy-1.14.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:af29a935803cc707ab2ed7791c44288a682f9c8107bc00f0eccc4f92c08d6e07"}, + {file = "scipy-1.14.1-cp312-cp312-macosx_14_0_arm64.whl", hash = 
"sha256:2843f2d527d9eebec9a43e6b406fb7266f3af25a751aa91d62ff416f54170bc5"}, + {file = "scipy-1.14.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:eb58ca0abd96911932f688528977858681a59d61a7ce908ffd355957f7025cfc"}, + {file = "scipy-1.14.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:30ac8812c1d2aab7131a79ba62933a2a76f582d5dbbc695192453dae67ad6310"}, + {file = "scipy-1.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f9ea80f2e65bdaa0b7627fb00cbeb2daf163caa015e59b7516395fe3bd1e066"}, + {file = "scipy-1.14.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:edaf02b82cd7639db00dbff629995ef185c8df4c3ffa71a5562a595765a06ce1"}, + {file = "scipy-1.14.1-cp312-cp312-win_amd64.whl", hash = "sha256:2ff38e22128e6c03ff73b6bb0f85f897d2362f8c052e3b8ad00532198fbdae3f"}, + {file = "scipy-1.14.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1729560c906963fc8389f6aac023739ff3983e727b1a4d87696b7bf108316a79"}, + {file = "scipy-1.14.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:4079b90df244709e675cdc8b93bfd8a395d59af40b72e339c2287c91860deb8e"}, + {file = "scipy-1.14.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:e0cf28db0f24a38b2a0ca33a85a54852586e43cf6fd876365c86e0657cfe7d73"}, + {file = "scipy-1.14.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:0c2f95de3b04e26f5f3ad5bb05e74ba7f68b837133a4492414b3afd79dfe540e"}, + {file = "scipy-1.14.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b99722ea48b7ea25e8e015e8341ae74624f72e5f21fc2abd45f3a93266de4c5d"}, + {file = "scipy-1.14.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5149e3fd2d686e42144a093b206aef01932a0059c2a33ddfa67f5f035bdfe13e"}, + {file = "scipy-1.14.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e4f5a7c49323533f9103d4dacf4e4f07078f360743dec7f7596949149efeec06"}, + {file = "scipy-1.14.1-cp313-cp313-win_amd64.whl", hash = 
"sha256:baff393942b550823bfce952bb62270ee17504d02a1801d7fd0719534dfb9c84"}, + {file = "scipy-1.14.1.tar.gz", hash = "sha256:5a275584e726026a5699459aa72f828a610821006228e841b94275c4a7c08417"}, +] + +[package.dependencies] +numpy = ">=1.23.5,<2.3" + +[package.extras] +dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy (==1.10.0)", "pycodestyle", "pydevtool", "rich-click", "ruff (>=0.0.292)", "types-psutil", "typing_extensions"] +doc = ["jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.13.1)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0,<=7.3.7)", "sphinx-design (>=0.4.0)"] +test = ["Cython", "array-api-strict (>=2.0)", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] + +[[package]] +name = "seaborn" +version = "0.13.2" +description = "Statistical data visualization" +optional = false +python-versions = ">=3.8" +files = [ + {file = "seaborn-0.13.2-py3-none-any.whl", hash = "sha256:636f8336facf092165e27924f223d3c62ca560b1f2bb5dff7ab7fad265361987"}, + {file = "seaborn-0.13.2.tar.gz", hash = "sha256:93e60a40988f4d65e9f4885df477e2fdaff6b73a9ded434c1ab356dd57eefff7"}, +] + +[package.dependencies] +matplotlib = ">=3.4,<3.6.1 || >3.6.1" +numpy = ">=1.20,<1.24.0 || >1.24.0" +pandas = ">=1.2" + +[package.extras] +dev = ["flake8", "flit", "mypy", "pandas-stubs", "pre-commit", "pytest", "pytest-cov", "pytest-xdist"] +docs = ["ipykernel", "nbconvert", "numpydoc", "pydata_sphinx_theme (==0.10.0rc2)", "pyyaml", "sphinx (<6.0.0)", "sphinx-copybutton", "sphinx-design", "sphinx-issues"] +stats = ["scipy (>=1.7)", "statsmodels (>=0.12)"] + [[package]] name = "six" version = "1.16.0" @@ -1314,6 +1484,17 @@ pure-eval = "*" [package.extras] tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"] +[[package]] +name = "threadpoolctl" +version = "3.5.0" +description = 
"threadpoolctl" +optional = false +python-versions = ">=3.8" +files = [ + {file = "threadpoolctl-3.5.0-py3-none-any.whl", hash = "sha256:56c1e26c150397e58c4926da8eeee87533b1e32bef131bd4bf6a2f45f3185467"}, + {file = "threadpoolctl-3.5.0.tar.gz", hash = "sha256:082433502dd922bf738de0d8bcc4fdcbf0979ff44c42bd40f5af8a282f6fa107"}, +] + [[package]] name = "tornado" version = "6.4.1" @@ -1374,4 +1555,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "c64b09f7679bf7188ea5bf1c9bdfaf15eca7d0ba61aadb336111b13ef1633f13" +content-hash = "0873cc703854cb4dbaf70e8bdffeb557c2216e6414bb7fe5ebe42e93349ebdb1" diff --git a/pyproject.toml b/pyproject.toml index 265fa58..a70ab91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,10 @@ readme = "README.md" python = "^3.12" pandas = "^2.2.3" matplotlib = "^3.9.2" +seaborn = "^0.13.2" +numpy = "^2.1.2" +scikit-learn = "^1.5.2" +imblearn = "^0.0" [tool.poetry.group.dev.dependencies]