From bfd498e75d51e8c63df2637417183d0b78adbb67 Mon Sep 17 00:00:00 2001 From: MaD Date: Sat, 21 Dec 2024 02:37:45 +0400 Subject: [PATCH] =?UTF-8?q?=D0=B3=D0=B8=D1=82=20=D1=81=D0=BE=D0=B6=D1=80?= =?UTF-8?q?=D0=B0=D0=BB=20=D0=B2=D1=82=D0=BE=D1=80=D1=83=D1=8E=20=D0=BB?= =?UTF-8?q?=D0=B0=D0=B1=D1=83,=20=D0=BD=D0=B0=D0=B1=D1=80=D0=BE=D1=81?= =?UTF-8?q?=D0=B0=D0=BB=20=D0=BD=D0=B0=20=D0=BA=D0=BE=D0=BB=D0=B5=D0=BD?= =?UTF-8?q?=D0=BA=D0=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Lab_2/lab2.ipynb | 609 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 609 insertions(+) diff --git a/Lab_2/lab2.ipynb b/Lab_2/lab2.ipynb index e69de29..d975e54 100644 --- a/Lab_2/lab2.ipynb +++ b/Lab_2/lab2.ipynb @@ -0,0 +1,609 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Шаг 1. Анализ сведений о каждом наборе данных со страницы Kaggle\n", + "### 1.1 Оценки студентов на экзаменах:\n", + "Тематика: Анализ образовательной успеваемости.\n", + "Объекты наблюдения: Студенты.\n", + "Атрибуты: Пол, этническая принадлежность, уровень образования родителей, подготовка к тесту, оценки по математике, чтению и письму.\n", + "Цель: Понять факторы, влияющие на академическую успеваемость.\n", + "### 1.2 Цены на бриллианты:\n", + "Тематика: Рынок драгоценных камней.\n", + "Объекты наблюдения: Бриллианты.\n", + "Атрибуты: Вес (карат), цвет, чистота, качество огранки, цена.\n", + "Цель: Предсказать цену на основе характеристик.\n", + "### 1.3 Данные по инсультам:\n", + "Тематика: Здоровье.\n", + "Объекты наблюдения: Пациенты.\n", + "Атрибуты: Пол, возраст, наличие хронических заболеваний, привычки (например, курение), наличие инсульта.\n", + "Цель: Предсказать вероятность инсульта." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "<>:3: SyntaxWarning: invalid escape sequence '\\S'\n", + "<>:4: SyntaxWarning: invalid escape sequence '\\D'\n", + "<>:5: SyntaxWarning: invalid escape sequence '\\h'\n", + "<>:3: SyntaxWarning: invalid escape sequence '\\S'\n", + "<>:4: SyntaxWarning: invalid escape sequence '\\D'\n", + "<>:5: SyntaxWarning: invalid escape sequence '\\h'\n", + "C:\\Users\\MaD\\AppData\\Local\\Temp\\ipykernel_7684\\1678678831.py:3: SyntaxWarning: invalid escape sequence '\\S'\n", + " students_data = pd.read_csv(\"../data\\StudentsPerformance.csv\")\n", + "C:\\Users\\MaD\\AppData\\Local\\Temp\\ipykernel_7684\\1678678831.py:4: SyntaxWarning: invalid escape sequence '\\D'\n", + " diamonds_data = pd.read_csv(\"../data\\Diamonds Prices2022.csv\")\n", + "C:\\Users\\MaD\\AppData\\Local\\Temp\\ipykernel_7684\\1678678831.py:5: SyntaxWarning: invalid escape sequence '\\h'\n", + " stroke_data = pd.read_csv(\"../data\\healthcare-dataset-stroke-data.csv\")\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- Students ---\n", + "\n", + "RangeIndex: 1000 entries, 0 to 999\n", + "Data columns (total 8 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 gender 1000 non-null object\n", + " 1 race/ethnicity 1000 non-null object\n", + " 2 parental level of education 1000 non-null object\n", + " 3 lunch 1000 non-null object\n", + " 4 test preparation course 1000 non-null object\n", + " 5 math score 1000 non-null int64 \n", + " 6 reading score 1000 non-null int64 \n", + " 7 writing score 1000 non-null int64 \n", + "dtypes: int64(3), object(5)\n", + "memory usage: 62.6+ KB\n", + "None\n", + "gender 0\n", + "race/ethnicity 0\n", + "parental level of education 0\n", + "lunch 0\n", + "test preparation course 0\n", + "math score 0\n", + "reading score 0\n", + "writing score 
0\n", + "dtype: int64\n", + " math score reading score writing score\n", + "count 1000.00000 1000.000000 1000.000000\n", + "mean 66.08900 69.169000 68.054000\n", + "std 15.16308 14.600192 15.195657\n", + "min 0.00000 17.000000 10.000000\n", + "25% 57.00000 59.000000 57.750000\n", + "50% 66.00000 70.000000 69.000000\n", + "75% 77.00000 79.000000 79.000000\n", + "max 100.00000 100.000000 100.000000\n", + "--- Diamonds ---\n", + "\n", + "RangeIndex: 53943 entries, 0 to 53942\n", + "Data columns (total 11 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Unnamed: 0 53943 non-null int64 \n", + " 1 carat 53943 non-null float64\n", + " 2 cut 53943 non-null object \n", + " 3 color 53943 non-null object \n", + " 4 clarity 53943 non-null object \n", + " 5 depth 53943 non-null float64\n", + " 6 table 53943 non-null float64\n", + " 7 price 53943 non-null int64 \n", + " 8 x 53943 non-null float64\n", + " 9 y 53943 non-null float64\n", + " 10 z 53943 non-null float64\n", + "dtypes: float64(6), int64(2), object(3)\n", + "memory usage: 4.5+ MB\n", + "None\n", + "Unnamed: 0 0\n", + "carat 0\n", + "cut 0\n", + "color 0\n", + "clarity 0\n", + "depth 0\n", + "table 0\n", + "price 0\n", + "x 0\n", + "y 0\n", + "z 0\n", + "dtype: int64\n", + " Unnamed: 0 carat depth table price \\\n", + "count 53943.000000 53943.000000 53943.000000 53943.000000 53943.000000 \n", + "mean 26972.000000 0.797935 61.749322 57.457251 3932.734294 \n", + "std 15572.147122 0.473999 1.432626 2.234549 3989.338447 \n", + "min 1.000000 0.200000 43.000000 43.000000 326.000000 \n", + "25% 13486.500000 0.400000 61.000000 56.000000 950.000000 \n", + "50% 26972.000000 0.700000 61.800000 57.000000 2401.000000 \n", + "75% 40457.500000 1.040000 62.500000 59.000000 5324.000000 \n", + "max 53943.000000 5.010000 79.000000 95.000000 18823.000000 \n", + "\n", + " x y z \n", + "count 53943.000000 53943.000000 53943.000000 \n", + "mean 5.731158 5.734526 3.538730 \n", + "std 1.121730 
1.142103 0.705679 \n", + "min 0.000000 0.000000 0.000000 \n", + "25% 4.710000 4.720000 2.910000 \n", + "50% 5.700000 5.710000 3.530000 \n", + "75% 6.540000 6.540000 4.040000 \n", + "max 10.740000 58.900000 31.800000 \n", + "--- Stroke ---\n", + "\n", + "RangeIndex: 5110 entries, 0 to 5109\n", + "Data columns (total 12 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 5110 non-null int64 \n", + " 1 gender 5110 non-null object \n", + " 2 age 5110 non-null float64\n", + " 3 hypertension 5110 non-null int64 \n", + " 4 heart_disease 5110 non-null int64 \n", + " 5 ever_married 5110 non-null object \n", + " 6 work_type 5110 non-null object \n", + " 7 Residence_type 5110 non-null object \n", + " 8 avg_glucose_level 5110 non-null float64\n", + " 9 bmi 4909 non-null float64\n", + " 10 smoking_status 5110 non-null object \n", + " 11 stroke 5110 non-null int64 \n", + "dtypes: float64(3), int64(4), object(5)\n", + "memory usage: 479.2+ KB\n", + "None\n", + "id 0\n", + "gender 0\n", + "age 0\n", + "hypertension 0\n", + "heart_disease 0\n", + "ever_married 0\n", + "work_type 0\n", + "Residence_type 0\n", + "avg_glucose_level 0\n", + "bmi 201\n", + "smoking_status 0\n", + "stroke 0\n", + "dtype: int64\n", + " id age hypertension heart_disease \\\n", + "count 5110.000000 5110.000000 5110.000000 5110.000000 \n", + "mean 36517.829354 43.226614 0.097456 0.054012 \n", + "std 21161.721625 22.612647 0.296607 0.226063 \n", + "min 67.000000 0.080000 0.000000 0.000000 \n", + "25% 17741.250000 25.000000 0.000000 0.000000 \n", + "50% 36932.000000 45.000000 0.000000 0.000000 \n", + "75% 54682.000000 61.000000 0.000000 0.000000 \n", + "max 72940.000000 82.000000 1.000000 1.000000 \n", + "\n", + " avg_glucose_level bmi stroke \n", + "count 5110.000000 4909.000000 5110.000000 \n", + "mean 106.147677 28.893237 0.048728 \n", + "std 45.283560 7.854067 0.215320 \n", + "min 55.120000 10.300000 0.000000 \n", + "25% 77.245000 23.500000 0.000000 \n", + 
"50% 91.885000 28.100000 0.000000 \n", + "75% 114.090000 33.100000 0.000000 \n", + "max 271.740000 97.600000 1.000000 \n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "# Load the datasets. Forward slashes work on every OS and avoid the\n", + "# invalid-escape SyntaxWarning that backslash separators triggered.\n", + "students_data = pd.read_csv(\"../data/StudentsPerformance.csv\")\n", + "diamonds_data = pd.read_csv(\"../data/Diamonds Prices2022.csv\")\n", + "stroke_data = pd.read_csv(\"../data/healthcare-dataset-stroke-data.csv\")\n", + "\n", + "# Overview of every dataset: schema, missing values per column, numeric summary.\n", + "for name, dataset in {\"Students\": students_data, \"Diamonds\": diamonds_data, \"Stroke\": stroke_data}.items():\n", + "    print(f\"--- {name} ---\")\n", + "    dataset.info()  # info() prints by itself and returns None, so do not wrap it in print()\n", + "    print(dataset.isnull().sum())\n", + "    print(dataset.describe())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Шаг 3. Примеры бизнес-целей\n", + "Оценки студентов: Рекомендации по обучению для повышения успеваемости.\n", + "Цены на бриллианты: Оценка стоимости для продаж.\n", + "Данные по инсультам: повышения эффективности здравоохранения" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Шаг 4. Цели технического проекта\n", + "Примеры:\n", + "Оценки студентов:\n", + "Вход: Данные о студентах.\n", + "Целевой признак: Оценки.\n", + "Цены на бриллианты:\n", + "Вход: Характеристики бриллиантов.\n", + "Целевой признак: Цена.\n", + "Данные по инсультам:\n", + "Вход: Данные о пациентах.\n", + "Целевой признак: Наличие инсульта." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- Students ---\n", + "Пропущенные значения:\n", + " gender 0\n", + "race/ethnicity 0\n", + "parental level of education 0\n", + "lunch 0\n", + "test preparation course 0\n", + "math score 0\n", + "reading score 0\n", + "writing score 0\n", + "dtype: int64\n", + "Выбросы и статистика:\n", + " math score reading score writing score\n", + "count 1000.00000 1000.000000 1000.000000\n", + "mean 66.08900 69.169000 68.054000\n", + "std 15.16308 14.600192 15.195657\n", + "min 0.00000 17.000000 10.000000\n", + "25% 57.00000 59.000000 57.750000\n", + "50% 66.00000 70.000000 69.000000\n", + "75% 77.00000 79.000000 79.000000\n", + "max 100.00000 100.000000 100.000000\n", + "--- Diamonds ---\n", + "Пропущенные значения:\n", + " Unnamed: 0 0\n", + "carat 0\n", + "cut 0\n", + "color 0\n", + "clarity 0\n", + "depth 0\n", + "table 0\n", + "price 0\n", + "x 0\n", + "y 0\n", + "z 0\n", + "dtype: int64\n", + "Выбросы и статистика:\n", + " Unnamed: 0 carat depth table price \\\n", + "count 53943.000000 53943.000000 53943.000000 53943.000000 53943.000000 \n", + "mean 26972.000000 0.797935 61.749322 57.457251 3932.734294 \n", + "std 15572.147122 0.473999 1.432626 2.234549 3989.338447 \n", + "min 1.000000 0.200000 43.000000 43.000000 326.000000 \n", + "25% 13486.500000 0.400000 61.000000 56.000000 950.000000 \n", + "50% 26972.000000 0.700000 61.800000 57.000000 2401.000000 \n", + "75% 40457.500000 1.040000 62.500000 59.000000 5324.000000 \n", + "max 53943.000000 5.010000 79.000000 95.000000 18823.000000 \n", + "\n", + " x y z \n", + "count 53943.000000 53943.000000 53943.000000 \n", + "mean 5.731158 5.734526 3.538730 \n", + "std 1.121730 1.142103 0.705679 \n", + "min 0.000000 0.000000 0.000000 \n", + "25% 4.710000 4.720000 2.910000 \n", + "50% 5.700000 5.710000 3.530000 \n", + "75% 6.540000 6.540000 4.040000 \n", + "max 10.740000 
58.900000 31.800000 \n", + "--- Stroke ---\n", + "Пропущенные значения:\n", + " id 0\n", + "gender 0\n", + "age 0\n", + "hypertension 0\n", + "heart_disease 0\n", + "ever_married 0\n", + "work_type 0\n", + "Residence_type 0\n", + "avg_glucose_level 0\n", + "bmi 201\n", + "smoking_status 0\n", + "stroke 0\n", + "dtype: int64\n", + "Выбросы и статистика:\n", + " id age hypertension heart_disease \\\n", + "count 5110.000000 5110.000000 5110.000000 5110.000000 \n", + "mean 36517.829354 43.226614 0.097456 0.054012 \n", + "std 21161.721625 22.612647 0.296607 0.226063 \n", + "min 67.000000 0.080000 0.000000 0.000000 \n", + "25% 17741.250000 25.000000 0.000000 0.000000 \n", + "50% 36932.000000 45.000000 0.000000 0.000000 \n", + "75% 54682.000000 61.000000 0.000000 0.000000 \n", + "max 72940.000000 82.000000 1.000000 1.000000 \n", + "\n", + " avg_glucose_level bmi stroke \n", + "count 5110.000000 4909.000000 5110.000000 \n", + "mean 106.147677 28.893237 0.048728 \n", + "std 45.283560 7.854067 0.215320 \n", + "min 55.120000 10.300000 0.000000 \n", + "25% 77.245000 23.500000 0.000000 \n", + "50% 91.885000 28.100000 0.000000 \n", + "75% 114.090000 33.100000 0.000000 \n", + "max 271.740000 97.600000 1.000000 \n" + ] + } + ], + "source": [ + "# Missing values and summary statistics (outlier check) for every dataset\n", + "for name, dataset in {\"Students\": students_data, \"Diamonds\": diamonds_data, \"Stroke\": stroke_data}.items():\n", + "    print(f\"--- {name} ---\")\n", + "    print(\"Пропущенные значения:\\n\", dataset.isnull().sum())\n", + "    print(\"Выбросы и статистика:\\n\", dataset.describe())\n" + ] + }, 
+ { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Students and Diamonds contain no missing values; dropna() is kept only as a safeguard.\n", + "students_data = students_data.dropna()\n", + "diamonds_data = diamonds_data.dropna()\n", + "\n", + "# Stroke: only `bmi` has gaps (201 of 5110 rows). Dropping those rows would also\n", + "# discard about 40 of the ~249 stroke-positive patients, so fill with the median instead.\n", + "stroke_data = stroke_data.fillna({\"bmi\": stroke_data[\"bmi\"].median()})\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Данные в целом довольно качественные, без значительных выбросов. Пропуски есть только в столбце bmi набора Stroke (201 значение) и заполняются медианой. 
Информативность находится на достаточном уровне, датасеты пригодны для дальнейшего анализа" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- Students ---\n", + "Статистика после очистки:\n", + " math score reading score writing score\n", + "count 1000.00000 1000.000000 1000.000000\n", + "mean 66.08900 69.169000 68.054000\n", + "std 15.16308 14.600192 15.195657\n", + "min 0.00000 17.000000 10.000000\n", + "25% 57.00000 59.000000 57.750000\n", + "50% 66.00000 70.000000 69.000000\n", + "75% 77.00000 79.000000 79.000000\n", + "max 100.00000 100.000000 100.000000\n", + "--- Diamonds ---\n", + "Статистика после очистки:\n", + " Unnamed: 0 carat depth table price \\\n", + "count 53943.000000 53943.000000 53943.000000 53943.000000 53943.000000 \n", + "mean 26972.000000 0.797935 61.749322 57.457251 3932.734294 \n", + "std 15572.147122 0.473999 1.432626 2.234549 3989.338447 \n", + "min 1.000000 0.200000 43.000000 43.000000 326.000000 \n", + "25% 13486.500000 0.400000 61.000000 56.000000 950.000000 \n", + "50% 26972.000000 0.700000 61.800000 57.000000 2401.000000 \n", + "75% 40457.500000 1.040000 62.500000 59.000000 5324.000000 \n", + "max 53943.000000 5.010000 79.000000 95.000000 18823.000000 \n", + "\n", + " x y z \n", + "count 53943.000000 53943.000000 53943.000000 \n", + "mean 5.731158 5.734526 3.538730 \n", + "std 1.121730 1.142103 0.705679 \n", + "min 0.000000 0.000000 0.000000 \n", + "25% 4.710000 4.720000 2.910000 \n", + "50% 5.700000 5.710000 3.530000 \n", + "75% 6.540000 6.540000 4.040000 \n", + "max 10.740000 58.900000 31.800000 \n", + "--- Stroke ---\n", + "Статистика после очистки:\n", + " id age hypertension heart_disease \\\n", + "count 4909.000000 4909.000000 4909.000000 4909.000000 \n", + "mean 37064.313506 42.865374 0.091872 0.049501 \n", + "std 20995.098457 22.555115 0.288875 0.216934 \n", + "min 77.000000 0.080000 0.000000 0.000000 \n", + "25% 
18605.000000 25.000000 0.000000 0.000000 \n", + "50% 37608.000000 44.000000 0.000000 0.000000 \n", + "75% 55220.000000 60.000000 0.000000 0.000000 \n", + "max 72940.000000 82.000000 1.000000 1.000000 \n", + "\n", + " avg_glucose_level bmi stroke \n", + "count 4909.000000 4909.000000 4909.000000 \n", + "mean 105.305150 28.893237 0.042575 \n", + "std 44.424341 7.854067 0.201917 \n", + "min 55.120000 10.300000 0.000000 \n", + "25% 77.070000 23.500000 0.000000 \n", + "50% 91.680000 28.100000 0.000000 \n", + "75% 113.570000 33.100000 0.000000 \n", + "max 271.740000 97.600000 1.000000 \n" + ] + } + ], + "source": [ + "# Summary statistics of every dataset after the cleaning step\n", + "for name, dataset in {\"Students\": students_data, \"Diamonds\": diamonds_data, \"Stroke\": stroke_data}.items():\n", + "    print(f\"--- {name} ---\")\n", + "    print(\"Статистика после очистки:\\n\", dataset.describe())\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Fixed seed so the stratified 80/20 splits are identical on every run.\n", + "SEED = 42\n", + "\n", + "students_train, students_test = train_test_split(students_data, test_size=0.2, stratify=students_data['gender'], random_state=SEED)\n", + "diamonds_train, diamonds_test = train_test_split(diamonds_data, test_size=0.2, stratify=diamonds_data['cut'], random_state=SEED)\n", + "stroke_train, stroke_test = train_test_split(stroke_data, test_size=0.2, stratify=stroke_data['stroke'], random_state=SEED)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from imblearn.over_sampling import RandomOverSampler\n", + "\n", + "# Balance the stroke training split by oversampling the minority class\n", + "oversampler = RandomOverSampler(random_state=42)\n", + "X_train, y_train = oversampler.fit_resample(stroke_train.drop(columns=['stroke']), stroke_train['stroke'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from imblearn.under_sampling import RandomUnderSampler\n", + "\n", + "# Oversampling vs. undersampling example. Both start from the original imbalanced\n", + "# training split: X_train/y_train are already balanced, so resampling them again\n", + "# would leave the class counts unchanged.\n", + "X_imbalanced = stroke_train.drop(columns=['stroke'])\n", + "y_imbalanced = stroke_train['stroke']\n", + "\n", + "oversampler = RandomOverSampler(random_state=42)\n", + "undersampler = RandomUnderSampler(random_state=42)\n", + "\n", + "X_over, y_over = oversampler.fit_resample(X_imbalanced, y_imbalanced)\n", + "X_under, y_under = undersampler.fit_resample(X_imbalanced, y_imbalanced)\n" 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}