lab_2 #8
1
.gitignore
vendored
1
.gitignore
vendored
@ -1,2 +1,3 @@
|
||||
data/jio_mart_items.csv
|
||||
/data
|
||||
/Lab_2/lab_2.ipynb
|
||||
|
609
Lab_2/lab2.ipynb
Normal file
609
Lab_2/lab2.ipynb
Normal file
@ -0,0 +1,609 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Шаг 1. Анализ сведений о каждом наборе данных со страницы Kaggle\n",
|
||||
"### 1.1 Оценки студентов на экзаменах:\n",
|
||||
"Тематика: Анализ образовательной успеваемости.\n",
|
||||
"Объекты наблюдения: Студенты.\n",
|
||||
"Атрибуты: Пол, этническая принадлежность, уровень образования родителей, подготовка к тесту, оценки по математике, чтению и письму.\n",
|
||||
"Цель: Понять факторы, влияющие на академическую успеваемость.\n",
|
||||
"### 1.2 Цены на бриллианты:\n",
|
||||
"Тематика: Рынок драгоценных камней.\n",
|
||||
"Объекты наблюдения: Бриллианты.\n",
|
||||
"Атрибуты: Вес (карат), цвет, чистота, качество огранки, цена.\n",
|
||||
"Цель: Предсказать цену на основе характеристик.\n",
|
||||
"### 1.3 Данные по инсультам:\n",
|
||||
"Тематика: Здоровье.\n",
|
||||
"Объекты наблюдения: Пациенты.\n",
|
||||
"Атрибуты: Пол, возраст, наличие хронических заболеваний, привычки (например, курение), наличие инсульта.\n",
|
||||
"Цель: Предсказать вероятность инсульта."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<>:3: SyntaxWarning: invalid escape sequence '\\S'\n",
|
||||
"<>:4: SyntaxWarning: invalid escape sequence '\\D'\n",
|
||||
"<>:5: SyntaxWarning: invalid escape sequence '\\h'\n",
|
||||
"<>:3: SyntaxWarning: invalid escape sequence '\\S'\n",
|
||||
"<>:4: SyntaxWarning: invalid escape sequence '\\D'\n",
|
||||
"<>:5: SyntaxWarning: invalid escape sequence '\\h'\n",
|
||||
"C:\\Users\\MaD\\AppData\\Local\\Temp\\ipykernel_7684\\1678678831.py:3: SyntaxWarning: invalid escape sequence '\\S'\n",
|
||||
" students_data = pd.read_csv(\"../data\\StudentsPerformance.csv\")\n",
|
||||
"C:\\Users\\MaD\\AppData\\Local\\Temp\\ipykernel_7684\\1678678831.py:4: SyntaxWarning: invalid escape sequence '\\D'\n",
|
||||
" diamonds_data = pd.read_csv(\"../data\\Diamonds Prices2022.csv\")\n",
|
||||
"C:\\Users\\MaD\\AppData\\Local\\Temp\\ipykernel_7684\\1678678831.py:5: SyntaxWarning: invalid escape sequence '\\h'\n",
|
||||
" stroke_data = pd.read_csv(\"../data\\healthcare-dataset-stroke-data.csv\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"--- Students ---\n",
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"RangeIndex: 1000 entries, 0 to 999\n",
|
||||
"Data columns (total 8 columns):\n",
|
||||
" # Column Non-Null Count Dtype \n",
|
||||
"--- ------ -------------- ----- \n",
|
||||
" 0 gender 1000 non-null object\n",
|
||||
" 1 race/ethnicity 1000 non-null object\n",
|
||||
" 2 parental level of education 1000 non-null object\n",
|
||||
" 3 lunch 1000 non-null object\n",
|
||||
" 4 test preparation course 1000 non-null object\n",
|
||||
" 5 math score 1000 non-null int64 \n",
|
||||
" 6 reading score 1000 non-null int64 \n",
|
||||
" 7 writing score 1000 non-null int64 \n",
|
||||
"dtypes: int64(3), object(5)\n",
|
||||
"memory usage: 62.6+ KB\n",
|
||||
"None\n",
|
||||
"gender 0\n",
|
||||
"race/ethnicity 0\n",
|
||||
"parental level of education 0\n",
|
||||
"lunch 0\n",
|
||||
"test preparation course 0\n",
|
||||
"math score 0\n",
|
||||
"reading score 0\n",
|
||||
"writing score 0\n",
|
||||
"dtype: int64\n",
|
||||
" math score reading score writing score\n",
|
||||
"count 1000.00000 1000.000000 1000.000000\n",
|
||||
"mean 66.08900 69.169000 68.054000\n",
|
||||
"std 15.16308 14.600192 15.195657\n",
|
||||
"min 0.00000 17.000000 10.000000\n",
|
||||
"25% 57.00000 59.000000 57.750000\n",
|
||||
"50% 66.00000 70.000000 69.000000\n",
|
||||
"75% 77.00000 79.000000 79.000000\n",
|
||||
"max 100.00000 100.000000 100.000000\n",
|
||||
"--- Diamonds ---\n",
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"RangeIndex: 53943 entries, 0 to 53942\n",
|
||||
"Data columns (total 11 columns):\n",
|
||||
" # Column Non-Null Count Dtype \n",
|
||||
"--- ------ -------------- ----- \n",
|
||||
" 0 Unnamed: 0 53943 non-null int64 \n",
|
||||
" 1 carat 53943 non-null float64\n",
|
||||
" 2 cut 53943 non-null object \n",
|
||||
" 3 color 53943 non-null object \n",
|
||||
" 4 clarity 53943 non-null object \n",
|
||||
" 5 depth 53943 non-null float64\n",
|
||||
" 6 table 53943 non-null float64\n",
|
||||
" 7 price 53943 non-null int64 \n",
|
||||
" 8 x 53943 non-null float64\n",
|
||||
" 9 y 53943 non-null float64\n",
|
||||
" 10 z 53943 non-null float64\n",
|
||||
"dtypes: float64(6), int64(2), object(3)\n",
|
||||
"memory usage: 4.5+ MB\n",
|
||||
"None\n",
|
||||
"Unnamed: 0 0\n",
|
||||
"carat 0\n",
|
||||
"cut 0\n",
|
||||
"color 0\n",
|
||||
"clarity 0\n",
|
||||
"depth 0\n",
|
||||
"table 0\n",
|
||||
"price 0\n",
|
||||
"x 0\n",
|
||||
"y 0\n",
|
||||
"z 0\n",
|
||||
"dtype: int64\n",
|
||||
" Unnamed: 0 carat depth table price \\\n",
|
||||
"count 53943.000000 53943.000000 53943.000000 53943.000000 53943.000000 \n",
|
||||
"mean 26972.000000 0.797935 61.749322 57.457251 3932.734294 \n",
|
||||
"std 15572.147122 0.473999 1.432626 2.234549 3989.338447 \n",
|
||||
"min 1.000000 0.200000 43.000000 43.000000 326.000000 \n",
|
||||
"25% 13486.500000 0.400000 61.000000 56.000000 950.000000 \n",
|
||||
"50% 26972.000000 0.700000 61.800000 57.000000 2401.000000 \n",
|
||||
"75% 40457.500000 1.040000 62.500000 59.000000 5324.000000 \n",
|
||||
"max 53943.000000 5.010000 79.000000 95.000000 18823.000000 \n",
|
||||
"\n",
|
||||
" x y z \n",
|
||||
"count 53943.000000 53943.000000 53943.000000 \n",
|
||||
"mean 5.731158 5.734526 3.538730 \n",
|
||||
"std 1.121730 1.142103 0.705679 \n",
|
||||
"min 0.000000 0.000000 0.000000 \n",
|
||||
"25% 4.710000 4.720000 2.910000 \n",
|
||||
"50% 5.700000 5.710000 3.530000 \n",
|
||||
"75% 6.540000 6.540000 4.040000 \n",
|
||||
"max 10.740000 58.900000 31.800000 \n",
|
||||
"--- Stroke ---\n",
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"RangeIndex: 5110 entries, 0 to 5109\n",
|
||||
"Data columns (total 12 columns):\n",
|
||||
" # Column Non-Null Count Dtype \n",
|
||||
"--- ------ -------------- ----- \n",
|
||||
" 0 id 5110 non-null int64 \n",
|
||||
" 1 gender 5110 non-null object \n",
|
||||
" 2 age 5110 non-null float64\n",
|
||||
" 3 hypertension 5110 non-null int64 \n",
|
||||
" 4 heart_disease 5110 non-null int64 \n",
|
||||
" 5 ever_married 5110 non-null object \n",
|
||||
" 6 work_type 5110 non-null object \n",
|
||||
" 7 Residence_type 5110 non-null object \n",
|
||||
" 8 avg_glucose_level 5110 non-null float64\n",
|
||||
" 9 bmi 4909 non-null float64\n",
|
||||
" 10 smoking_status 5110 non-null object \n",
|
||||
" 11 stroke 5110 non-null int64 \n",
|
||||
"dtypes: float64(3), int64(4), object(5)\n",
|
||||
"memory usage: 479.2+ KB\n",
|
||||
"None\n",
|
||||
"id 0\n",
|
||||
"gender 0\n",
|
||||
"age 0\n",
|
||||
"hypertension 0\n",
|
||||
"heart_disease 0\n",
|
||||
"ever_married 0\n",
|
||||
"work_type 0\n",
|
||||
"Residence_type 0\n",
|
||||
"avg_glucose_level 0\n",
|
||||
"bmi 201\n",
|
||||
"smoking_status 0\n",
|
||||
"stroke 0\n",
|
||||
"dtype: int64\n",
|
||||
" id age hypertension heart_disease \\\n",
|
||||
"count 5110.000000 5110.000000 5110.000000 5110.000000 \n",
|
||||
"mean 36517.829354 43.226614 0.097456 0.054012 \n",
|
||||
"std 21161.721625 22.612647 0.296607 0.226063 \n",
|
||||
"min 67.000000 0.080000 0.000000 0.000000 \n",
|
||||
"25% 17741.250000 25.000000 0.000000 0.000000 \n",
|
||||
"50% 36932.000000 45.000000 0.000000 0.000000 \n",
|
||||
"75% 54682.000000 61.000000 0.000000 0.000000 \n",
|
||||
"max 72940.000000 82.000000 1.000000 1.000000 \n",
|
||||
"\n",
|
||||
" avg_glucose_level bmi stroke \n",
|
||||
"count 5110.000000 4909.000000 5110.000000 \n",
|
||||
"mean 106.147677 28.893237 0.048728 \n",
|
||||
"std 45.283560 7.854067 0.215320 \n",
|
||||
"min 55.120000 10.300000 0.000000 \n",
|
||||
"25% 77.245000 23.500000 0.000000 \n",
|
||||
"50% 91.885000 28.100000 0.000000 \n",
|
||||
"75% 114.090000 33.100000 0.000000 \n",
|
||||
"max 271.740000 97.600000 1.000000 \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"# Load the datasets. Forward slashes work as path separators on every OS; a\n",
|
||||
"# backslash separator would start an invalid escape sequence inside the string\n",
|
||||
"# literal (reported as SyntaxWarning by Python 3.12).\n",
|
||||
"students_data = pd.read_csv(\"../data/StudentsPerformance.csv\")\n",
|
||||
"diamonds_data = pd.read_csv(\"../data/Diamonds Prices2022.csv\")\n",
|
||||
"stroke_data = pd.read_csv(\"../data/healthcare-dataset-stroke-data.csv\")\n",
|
||||
"\n",
|
||||
"# Per-dataset overview: schema, missing values per column, summary statistics.\n",
|
||||
"for name, dataset in {\"Students\": students_data, \"Diamonds\": diamonds_data, \"Stroke\": stroke_data}.items():\n",
|
||||
"    print(f\"--- {name} ---\")\n",
|
||||
"    # DataFrame.info() prints its report itself and returns None, so it must not\n",
|
||||
"    # be wrapped in print() (that would print an extra \"None\" line).\n",
|
||||
"    dataset.info()\n",
|
||||
"    print(dataset.isnull().sum())\n",
|
||||
"    print(dataset.describe())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Шаг 3. Примеры бизнес-целей\n",
|
||||
"Оценки студентов: Рекомендации по обучению для повышения успеваемости.\n",
|
||||
"Цены на бриллианты: Оценка стоимости для продаж.\n",
|
||||
"Данные по инсультам: Повышение эффективности здравоохранения."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Шаг 4. Цели технического проекта\n",
|
||||
"Примеры:\n",
|
||||
"Оценки студентов:\n",
|
||||
"Вход: Данные о студентах.\n",
|
||||
"Целевой признак: Оценки.\n",
|
||||
"Цены на бриллианты:\n",
|
||||
"Вход: Характеристики бриллиантов.\n",
|
||||
"Целевой признак: Цена.\n",
|
||||
"Данные по инсультам:\n",
|
||||
"Вход: Данные о пациентах.\n",
|
||||
"Целевой признак: Наличие инсульта."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"--- Students ---\n",
|
||||
"Пропущенные значения:\n",
|
||||
" gender 0\n",
|
||||
"race/ethnicity 0\n",
|
||||
"parental level of education 0\n",
|
||||
"lunch 0\n",
|
||||
"test preparation course 0\n",
|
||||
"math score 0\n",
|
||||
"reading score 0\n",
|
||||
"writing score 0\n",
|
||||
"dtype: int64\n",
|
||||
"Выбросы и статистика:\n",
|
||||
" math score reading score writing score\n",
|
||||
"count 1000.00000 1000.000000 1000.000000\n",
|
||||
"mean 66.08900 69.169000 68.054000\n",
|
||||
"std 15.16308 14.600192 15.195657\n",
|
||||
"min 0.00000 17.000000 10.000000\n",
|
||||
"25% 57.00000 59.000000 57.750000\n",
|
||||
"50% 66.00000 70.000000 69.000000\n",
|
||||
"75% 77.00000 79.000000 79.000000\n",
|
||||
"max 100.00000 100.000000 100.000000\n",
|
||||
"--- Diamonds ---\n",
|
||||
"Пропущенные значения:\n",
|
||||
" Unnamed: 0 0\n",
|
||||
"carat 0\n",
|
||||
"cut 0\n",
|
||||
"color 0\n",
|
||||
"clarity 0\n",
|
||||
"depth 0\n",
|
||||
"table 0\n",
|
||||
"price 0\n",
|
||||
"x 0\n",
|
||||
"y 0\n",
|
||||
"z 0\n",
|
||||
"dtype: int64\n",
|
||||
"Выбросы и статистика:\n",
|
||||
" Unnamed: 0 carat depth table price \\\n",
|
||||
"count 53943.000000 53943.000000 53943.000000 53943.000000 53943.000000 \n",
|
||||
"mean 26972.000000 0.797935 61.749322 57.457251 3932.734294 \n",
|
||||
"std 15572.147122 0.473999 1.432626 2.234549 3989.338447 \n",
|
||||
"min 1.000000 0.200000 43.000000 43.000000 326.000000 \n",
|
||||
"25% 13486.500000 0.400000 61.000000 56.000000 950.000000 \n",
|
||||
"50% 26972.000000 0.700000 61.800000 57.000000 2401.000000 \n",
|
||||
"75% 40457.500000 1.040000 62.500000 59.000000 5324.000000 \n",
|
||||
"max 53943.000000 5.010000 79.000000 95.000000 18823.000000 \n",
|
||||
"\n",
|
||||
" x y z \n",
|
||||
"count 53943.000000 53943.000000 53943.000000 \n",
|
||||
"mean 5.731158 5.734526 3.538730 \n",
|
||||
"std 1.121730 1.142103 0.705679 \n",
|
||||
"min 0.000000 0.000000 0.000000 \n",
|
||||
"25% 4.710000 4.720000 2.910000 \n",
|
||||
"50% 5.700000 5.710000 3.530000 \n",
|
||||
"75% 6.540000 6.540000 4.040000 \n",
|
||||
"max 10.740000 58.900000 31.800000 \n",
|
||||
"--- Stroke ---\n",
|
||||
"Пропущенные значения:\n",
|
||||
" id 0\n",
|
||||
"gender 0\n",
|
||||
"age 0\n",
|
||||
"hypertension 0\n",
|
||||
"heart_disease 0\n",
|
||||
"ever_married 0\n",
|
||||
"work_type 0\n",
|
||||
"Residence_type 0\n",
|
||||
"avg_glucose_level 0\n",
|
||||
"bmi 201\n",
|
||||
"smoking_status 0\n",
|
||||
"stroke 0\n",
|
||||
"dtype: int64\n",
|
||||
"Выбросы и статистика:\n",
|
||||
" id age hypertension heart_disease \\\n",
|
||||
"count 5110.000000 5110.000000 5110.000000 5110.000000 \n",
|
||||
"mean 36517.829354 43.226614 0.097456 0.054012 \n",
|
||||
"std 21161.721625 22.612647 0.296607 0.226063 \n",
|
||||
"min 67.000000 0.080000 0.000000 0.000000 \n",
|
||||
"25% 17741.250000 25.000000 0.000000 0.000000 \n",
|
||||
"50% 36932.000000 45.000000 0.000000 0.000000 \n",
|
||||
"75% 54682.000000 61.000000 0.000000 0.000000 \n",
|
||||
"max 72940.000000 82.000000 1.000000 1.000000 \n",
|
||||
"\n",
|
||||
" avg_glucose_level bmi stroke \n",
|
||||
"count 5110.000000 4909.000000 5110.000000 \n",
|
||||
"mean 106.147677 28.893237 0.048728 \n",
|
||||
"std 45.283560 7.854067 0.215320 \n",
|
||||
"min 55.120000 10.300000 0.000000 \n",
|
||||
"25% 77.245000 23.500000 0.000000 \n",
|
||||
"50% 91.885000 28.100000 0.000000 \n",
|
||||
"75% 114.090000 33.100000 0.000000 \n",
|
||||
"max 271.740000 97.600000 1.000000 \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Missing-value counts and summary statistics (a first look at outliers) per dataset\n",
|
||||
"labels = (\"Students\", \"Diamonds\", \"Stroke\")\n",
|
||||
"frames = (students_data, diamonds_data, stroke_data)\n",
|
||||
"for label, frame in zip(labels, frames):\n",
|
||||
"    print(f\"--- {label} ---\")\n",
|
||||
"    print(\"Пропущенные значения:\\n\", frame.isna().sum())\n",
|
||||
"    print(\"Выбросы и статистика:\\n\", frame.describe())\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"--- Students ---\n",
|
||||
"Пропущенные значения:\n",
|
||||
" gender 0\n",
|
||||
"race/ethnicity 0\n",
|
||||
"parental level of education 0\n",
|
||||
"lunch 0\n",
|
||||
"test preparation course 0\n",
|
||||
"math score 0\n",
|
||||
"reading score 0\n",
|
||||
"writing score 0\n",
|
||||
"dtype: int64\n",
|
||||
"Выбросы и статистика:\n",
|
||||
" math score reading score writing score\n",
|
||||
"count 1000.00000 1000.000000 1000.000000\n",
|
||||
"mean 66.08900 69.169000 68.054000\n",
|
||||
"std 15.16308 14.600192 15.195657\n",
|
||||
"min 0.00000 17.000000 10.000000\n",
|
||||
"25% 57.00000 59.000000 57.750000\n",
|
||||
"50% 66.00000 70.000000 69.000000\n",
|
||||
"75% 77.00000 79.000000 79.000000\n",
|
||||
"max 100.00000 100.000000 100.000000\n",
|
||||
"--- Diamonds ---\n",
|
||||
"Пропущенные значения:\n",
|
||||
" Unnamed: 0 0\n",
|
||||
"carat 0\n",
|
||||
"cut 0\n",
|
||||
"color 0\n",
|
||||
"clarity 0\n",
|
||||
"depth 0\n",
|
||||
"table 0\n",
|
||||
"price 0\n",
|
||||
"x 0\n",
|
||||
"y 0\n",
|
||||
"z 0\n",
|
||||
"dtype: int64\n",
|
||||
"Выбросы и статистика:\n",
|
||||
" Unnamed: 0 carat depth table price \\\n",
|
||||
"count 53943.000000 53943.000000 53943.000000 53943.000000 53943.000000 \n",
|
||||
"mean 26972.000000 0.797935 61.749322 57.457251 3932.734294 \n",
|
||||
"std 15572.147122 0.473999 1.432626 2.234549 3989.338447 \n",
|
||||
"min 1.000000 0.200000 43.000000 43.000000 326.000000 \n",
|
||||
"25% 13486.500000 0.400000 61.000000 56.000000 950.000000 \n",
|
||||
"50% 26972.000000 0.700000 61.800000 57.000000 2401.000000 \n",
|
||||
"75% 40457.500000 1.040000 62.500000 59.000000 5324.000000 \n",
|
||||
"max 53943.000000 5.010000 79.000000 95.000000 18823.000000 \n",
|
||||
"\n",
|
||||
" x y z \n",
|
||||
"count 53943.000000 53943.000000 53943.000000 \n",
|
||||
"mean 5.731158 5.734526 3.538730 \n",
|
||||
"std 1.121730 1.142103 0.705679 \n",
|
||||
"min 0.000000 0.000000 0.000000 \n",
|
||||
"25% 4.710000 4.720000 2.910000 \n",
|
||||
"50% 5.700000 5.710000 3.530000 \n",
|
||||
"75% 6.540000 6.540000 4.040000 \n",
|
||||
"max 10.740000 58.900000 31.800000 \n",
|
||||
"--- Stroke ---\n",
|
||||
"Пропущенные значения:\n",
|
||||
" id 0\n",
|
||||
"gender 0\n",
|
||||
"age 0\n",
|
||||
"hypertension 0\n",
|
||||
"heart_disease 0\n",
|
||||
"ever_married 0\n",
|
||||
"work_type 0\n",
|
||||
"Residence_type 0\n",
|
||||
"avg_glucose_level 0\n",
|
||||
"bmi 201\n",
|
||||
"smoking_status 0\n",
|
||||
"stroke 0\n",
|
||||
"dtype: int64\n",
|
||||
"Выбросы и статистика:\n",
|
||||
" id age hypertension heart_disease \\\n",
|
||||
"count 5110.000000 5110.000000 5110.000000 5110.000000 \n",
|
||||
"mean 36517.829354 43.226614 0.097456 0.054012 \n",
|
||||
"std 21161.721625 22.612647 0.296607 0.226063 \n",
|
||||
"min 67.000000 0.080000 0.000000 0.000000 \n",
|
||||
"25% 17741.250000 25.000000 0.000000 0.000000 \n",
|
||||
"50% 36932.000000 45.000000 0.000000 0.000000 \n",
|
||||
"75% 54682.000000 61.000000 0.000000 0.000000 \n",
|
||||
"max 72940.000000 82.000000 1.000000 1.000000 \n",
|
||||
"\n",
|
||||
" avg_glucose_level bmi stroke \n",
|
||||
"count 5110.000000 4909.000000 5110.000000 \n",
|
||||
"mean 106.147677 28.893237 0.048728 \n",
|
||||
"std 45.283560 7.854067 0.215320 \n",
|
||||
"min 55.120000 10.300000 0.000000 \n",
|
||||
"25% 77.245000 23.500000 0.000000 \n",
|
||||
"50% 91.885000 28.100000 0.000000 \n",
|
||||
"75% 114.090000 33.100000 0.000000 \n",
|
||||
"max 271.740000 97.600000 1.000000 \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Report, for every dataset, how many values are missing in each column and\n",
|
||||
"# the descriptive statistics of its numeric columns\n",
|
||||
"datasets_by_name = {\"Students\": students_data, \"Diamonds\": diamonds_data, \"Stroke\": stroke_data}\n",
|
||||
"for name in datasets_by_name:\n",
|
||||
"    dataset = datasets_by_name[name]\n",
|
||||
"    missing_per_column = dataset.isnull().sum()\n",
|
||||
"    summary = dataset.describe()\n",
|
||||
"    print(f\"--- {name} ---\")\n",
|
||||
"    print(\"Пропущенные значения:\\n\", missing_per_column)\n",
|
||||
"    print(\"Выбросы и статистика:\\n\", summary)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Remove every row that has at least one missing value, dataset by dataset\n",
|
||||
"students_data, diamonds_data, stroke_data = [\n",
|
||||
"    frame.dropna() for frame in (students_data, diamonds_data, stroke_data)\n",
|
||||
"]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Данные в целом довольно качественные, без значительных выбросов или пропущенных значений. Информативность находится на достаточном уровне, датасеты пригодны для дальнейшего анализа."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"--- Students ---\n",
|
||||
"Статистика после очистки:\n",
|
||||
" math score reading score writing score\n",
|
||||
"count 1000.00000 1000.000000 1000.000000\n",
|
||||
"mean 66.08900 69.169000 68.054000\n",
|
||||
"std 15.16308 14.600192 15.195657\n",
|
||||
"min 0.00000 17.000000 10.000000\n",
|
||||
"25% 57.00000 59.000000 57.750000\n",
|
||||
"50% 66.00000 70.000000 69.000000\n",
|
||||
"75% 77.00000 79.000000 79.000000\n",
|
||||
"max 100.00000 100.000000 100.000000\n",
|
||||
"--- Diamonds ---\n",
|
||||
"Статистика после очистки:\n",
|
||||
" Unnamed: 0 carat depth table price \\\n",
|
||||
"count 53943.000000 53943.000000 53943.000000 53943.000000 53943.000000 \n",
|
||||
"mean 26972.000000 0.797935 61.749322 57.457251 3932.734294 \n",
|
||||
"std 15572.147122 0.473999 1.432626 2.234549 3989.338447 \n",
|
||||
"min 1.000000 0.200000 43.000000 43.000000 326.000000 \n",
|
||||
"25% 13486.500000 0.400000 61.000000 56.000000 950.000000 \n",
|
||||
"50% 26972.000000 0.700000 61.800000 57.000000 2401.000000 \n",
|
||||
"75% 40457.500000 1.040000 62.500000 59.000000 5324.000000 \n",
|
||||
"max 53943.000000 5.010000 79.000000 95.000000 18823.000000 \n",
|
||||
"\n",
|
||||
" x y z \n",
|
||||
"count 53943.000000 53943.000000 53943.000000 \n",
|
||||
"mean 5.731158 5.734526 3.538730 \n",
|
||||
"std 1.121730 1.142103 0.705679 \n",
|
||||
"min 0.000000 0.000000 0.000000 \n",
|
||||
"25% 4.710000 4.720000 2.910000 \n",
|
||||
"50% 5.700000 5.710000 3.530000 \n",
|
||||
"75% 6.540000 6.540000 4.040000 \n",
|
||||
"max 10.740000 58.900000 31.800000 \n",
|
||||
"--- Stroke ---\n",
|
||||
"Статистика после очистки:\n",
|
||||
" id age hypertension heart_disease \\\n",
|
||||
"count 4909.000000 4909.000000 4909.000000 4909.000000 \n",
|
||||
"mean 37064.313506 42.865374 0.091872 0.049501 \n",
|
||||
"std 20995.098457 22.555115 0.288875 0.216934 \n",
|
||||
"min 77.000000 0.080000 0.000000 0.000000 \n",
|
||||
"25% 18605.000000 25.000000 0.000000 0.000000 \n",
|
||||
"50% 37608.000000 44.000000 0.000000 0.000000 \n",
|
||||
"75% 55220.000000 60.000000 0.000000 0.000000 \n",
|
||||
"max 72940.000000 82.000000 1.000000 1.000000 \n",
|
||||
"\n",
|
||||
" avg_glucose_level bmi stroke \n",
|
||||
"count 4909.000000 4909.000000 4909.000000 \n",
|
||||
"mean 105.305150 28.893237 0.042575 \n",
|
||||
"std 44.424341 7.854067 0.201917 \n",
|
||||
"min 55.120000 10.300000 0.000000 \n",
|
||||
"25% 77.070000 23.500000 0.000000 \n",
|
||||
"50% 91.680000 28.100000 0.000000 \n",
|
||||
"75% 113.570000 33.100000 0.000000 \n",
|
||||
"max 271.740000 97.600000 1.000000 \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Summary statistics of each dataset after the rows with missing values were dropped\n",
|
||||
"labels = (\"Students\", \"Diamonds\", \"Stroke\")\n",
|
||||
"frames = (students_data, diamonds_data, stroke_data)\n",
|
||||
"for label, frame in zip(labels, frames):\n",
|
||||
"    print(f\"--- {label} ---\")\n",
|
||||
"    print(\"Статистика после очистки:\\n\", frame.describe())\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"# 80/20 stratified splits. random_state is fixed so that Restart & Run All\n",
|
||||
"# reproduces the same train/test rows; 42 matches the seed used by the\n",
|
||||
"# resamplers elsewhere in this notebook.\n",
|
||||
"students_train, students_test = train_test_split(\n",
|
||||
"    students_data, test_size=0.2, stratify=students_data['gender'], random_state=42\n",
|
||||
")\n",
|
||||
"diamonds_train, diamonds_test = train_test_split(\n",
|
||||
"    diamonds_data, test_size=0.2, stratify=diamonds_data['cut'], random_state=42\n",
|
||||
")\n",
|
||||
"stroke_train, stroke_test = train_test_split(\n",
|
||||
"    stroke_data, test_size=0.2, stratify=stroke_data['stroke'], random_state=42\n",
|
||||
")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from imblearn.over_sampling import RandomOverSampler\n",
|
||||
"\n",
|
||||
"# Balance the classes of the stroke training split\n",
|
||||
"oversampler = RandomOverSampler(random_state=42)\n",
|
||||
"X_train, y_train = oversampler.fit_resample(\n",
|
||||
"    stroke_train.drop('stroke', axis=1),\n",
|
||||
"    stroke_train['stroke'],\n",
|
||||
")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from imblearn.under_sampling import RandomUnderSampler\n",
|
||||
"\n",
|
||||
"# Example of oversampling and undersampling.\n",
|
||||
"# Both samplers are fitted on the original (imbalanced) stroke training split.\n",
|
||||
"# X_train / y_train are the output of an earlier RandomOverSampler call, i.e.\n",
|
||||
"# already balanced, so resampling them again would leave the class balance\n",
|
||||
"# unchanged and the example would demonstrate nothing.\n",
|
||||
"stroke_features = stroke_train.drop(columns=['stroke'])\n",
|
||||
"stroke_target = stroke_train['stroke']\n",
|
||||
"\n",
|
||||
"oversampler = RandomOverSampler(random_state=42)\n",
|
||||
"undersampler = RandomUnderSampler(random_state=42)\n",
|
||||
"\n",
|
||||
"X_over, y_over = oversampler.fit_resample(stroke_features, stroke_target)\n",
|
||||
"X_under, y_under = undersampler.fit_resample(stroke_features, stroke_target)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
Loading…
Reference in New Issue
Block a user