Compare commits
9 Commits
Author | SHA1 | Date | |
---|---|---|---|
f384e3f572 | |||
a2a177418f | |||
fe8f73f4ed | |||
d8d984b974 | |||
|
bfd498e75d | ||
|
a4d1755f09 | ||
|
ecf86a9738 | ||
|
f28256ade8 | ||
8435d77782 |
609
Lab_2/lab2.ipynb
609
Lab_2/lab2.ipynb
@ -0,0 +1,609 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Шаг 1. Анализ сведений о каждом наборе данных со страницы Kaggle\n",
|
||||
"### 1.1 Оценки студентов на экзаменах:\n",
|
||||
"Тематика: Анализ образовательной успеваемости.\n",
|
||||
"Объекты наблюдения: Студенты.\n",
|
||||
"Атрибуты: Пол, этническая принадлежность, уровень образования родителей, подготовка к тесту, оценки по математике, чтению и письму.\n",
|
||||
"Цель: Понять факторы, влияющие на академическую успеваемость.\n",
|
||||
"### 1.2 Цены на бриллианты:\n",
|
||||
"Тематика: Рынок драгоценных камней.\n",
|
||||
"Объекты наблюдения: Бриллианты.\n",
|
||||
"Атрибуты: Вес (карат), цвет, чистота, качество огранки, цена.\n",
|
||||
"Цель: Предсказать цену на основе характеристик.\n",
|
||||
"### 1.3 Данные по инсультам:\n",
|
||||
"Тематика: Здоровье.\n",
|
||||
"Объекты наблюдения: Пациенты.\n",
|
||||
"Атрибуты: Пол, возраст, наличие хронических заболеваний, привычки (например, курение), наличие инсульта.\n",
|
||||
"Цель: Предсказать вероятность инсульта."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<>:3: SyntaxWarning: invalid escape sequence '\\S'\n",
|
||||
"<>:4: SyntaxWarning: invalid escape sequence '\\D'\n",
|
||||
"<>:5: SyntaxWarning: invalid escape sequence '\\h'\n",
|
||||
"<>:3: SyntaxWarning: invalid escape sequence '\\S'\n",
|
||||
"<>:4: SyntaxWarning: invalid escape sequence '\\D'\n",
|
||||
"<>:5: SyntaxWarning: invalid escape sequence '\\h'\n",
|
||||
"C:\\Users\\MaD\\AppData\\Local\\Temp\\ipykernel_7684\\1678678831.py:3: SyntaxWarning: invalid escape sequence '\\S'\n",
|
||||
" students_data = pd.read_csv(\"../data\\StudentsPerformance.csv\")\n",
|
||||
"C:\\Users\\MaD\\AppData\\Local\\Temp\\ipykernel_7684\\1678678831.py:4: SyntaxWarning: invalid escape sequence '\\D'\n",
|
||||
" diamonds_data = pd.read_csv(\"../data\\Diamonds Prices2022.csv\")\n",
|
||||
"C:\\Users\\MaD\\AppData\\Local\\Temp\\ipykernel_7684\\1678678831.py:5: SyntaxWarning: invalid escape sequence '\\h'\n",
|
||||
" stroke_data = pd.read_csv(\"../data\\healthcare-dataset-stroke-data.csv\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"--- Students ---\n",
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"RangeIndex: 1000 entries, 0 to 999\n",
|
||||
"Data columns (total 8 columns):\n",
|
||||
" # Column Non-Null Count Dtype \n",
|
||||
"--- ------ -------------- ----- \n",
|
||||
" 0 gender 1000 non-null object\n",
|
||||
" 1 race/ethnicity 1000 non-null object\n",
|
||||
" 2 parental level of education 1000 non-null object\n",
|
||||
" 3 lunch 1000 non-null object\n",
|
||||
" 4 test preparation course 1000 non-null object\n",
|
||||
" 5 math score 1000 non-null int64 \n",
|
||||
" 6 reading score 1000 non-null int64 \n",
|
||||
" 7 writing score 1000 non-null int64 \n",
|
||||
"dtypes: int64(3), object(5)\n",
|
||||
"memory usage: 62.6+ KB\n",
|
||||
"None\n",
|
||||
"gender 0\n",
|
||||
"race/ethnicity 0\n",
|
||||
"parental level of education 0\n",
|
||||
"lunch 0\n",
|
||||
"test preparation course 0\n",
|
||||
"math score 0\n",
|
||||
"reading score 0\n",
|
||||
"writing score 0\n",
|
||||
"dtype: int64\n",
|
||||
" math score reading score writing score\n",
|
||||
"count 1000.00000 1000.000000 1000.000000\n",
|
||||
"mean 66.08900 69.169000 68.054000\n",
|
||||
"std 15.16308 14.600192 15.195657\n",
|
||||
"min 0.00000 17.000000 10.000000\n",
|
||||
"25% 57.00000 59.000000 57.750000\n",
|
||||
"50% 66.00000 70.000000 69.000000\n",
|
||||
"75% 77.00000 79.000000 79.000000\n",
|
||||
"max 100.00000 100.000000 100.000000\n",
|
||||
"--- Diamonds ---\n",
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"RangeIndex: 53943 entries, 0 to 53942\n",
|
||||
"Data columns (total 11 columns):\n",
|
||||
" # Column Non-Null Count Dtype \n",
|
||||
"--- ------ -------------- ----- \n",
|
||||
" 0 Unnamed: 0 53943 non-null int64 \n",
|
||||
" 1 carat 53943 non-null float64\n",
|
||||
" 2 cut 53943 non-null object \n",
|
||||
" 3 color 53943 non-null object \n",
|
||||
" 4 clarity 53943 non-null object \n",
|
||||
" 5 depth 53943 non-null float64\n",
|
||||
" 6 table 53943 non-null float64\n",
|
||||
" 7 price 53943 non-null int64 \n",
|
||||
" 8 x 53943 non-null float64\n",
|
||||
" 9 y 53943 non-null float64\n",
|
||||
" 10 z 53943 non-null float64\n",
|
||||
"dtypes: float64(6), int64(2), object(3)\n",
|
||||
"memory usage: 4.5+ MB\n",
|
||||
"None\n",
|
||||
"Unnamed: 0 0\n",
|
||||
"carat 0\n",
|
||||
"cut 0\n",
|
||||
"color 0\n",
|
||||
"clarity 0\n",
|
||||
"depth 0\n",
|
||||
"table 0\n",
|
||||
"price 0\n",
|
||||
"x 0\n",
|
||||
"y 0\n",
|
||||
"z 0\n",
|
||||
"dtype: int64\n",
|
||||
" Unnamed: 0 carat depth table price \\\n",
|
||||
"count 53943.000000 53943.000000 53943.000000 53943.000000 53943.000000 \n",
|
||||
"mean 26972.000000 0.797935 61.749322 57.457251 3932.734294 \n",
|
||||
"std 15572.147122 0.473999 1.432626 2.234549 3989.338447 \n",
|
||||
"min 1.000000 0.200000 43.000000 43.000000 326.000000 \n",
|
||||
"25% 13486.500000 0.400000 61.000000 56.000000 950.000000 \n",
|
||||
"50% 26972.000000 0.700000 61.800000 57.000000 2401.000000 \n",
|
||||
"75% 40457.500000 1.040000 62.500000 59.000000 5324.000000 \n",
|
||||
"max 53943.000000 5.010000 79.000000 95.000000 18823.000000 \n",
|
||||
"\n",
|
||||
" x y z \n",
|
||||
"count 53943.000000 53943.000000 53943.000000 \n",
|
||||
"mean 5.731158 5.734526 3.538730 \n",
|
||||
"std 1.121730 1.142103 0.705679 \n",
|
||||
"min 0.000000 0.000000 0.000000 \n",
|
||||
"25% 4.710000 4.720000 2.910000 \n",
|
||||
"50% 5.700000 5.710000 3.530000 \n",
|
||||
"75% 6.540000 6.540000 4.040000 \n",
|
||||
"max 10.740000 58.900000 31.800000 \n",
|
||||
"--- Stroke ---\n",
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"RangeIndex: 5110 entries, 0 to 5109\n",
|
||||
"Data columns (total 12 columns):\n",
|
||||
" # Column Non-Null Count Dtype \n",
|
||||
"--- ------ -------------- ----- \n",
|
||||
" 0 id 5110 non-null int64 \n",
|
||||
" 1 gender 5110 non-null object \n",
|
||||
" 2 age 5110 non-null float64\n",
|
||||
" 3 hypertension 5110 non-null int64 \n",
|
||||
" 4 heart_disease 5110 non-null int64 \n",
|
||||
" 5 ever_married 5110 non-null object \n",
|
||||
" 6 work_type 5110 non-null object \n",
|
||||
" 7 Residence_type 5110 non-null object \n",
|
||||
" 8 avg_glucose_level 5110 non-null float64\n",
|
||||
" 9 bmi 4909 non-null float64\n",
|
||||
" 10 smoking_status 5110 non-null object \n",
|
||||
" 11 stroke 5110 non-null int64 \n",
|
||||
"dtypes: float64(3), int64(4), object(5)\n",
|
||||
"memory usage: 479.2+ KB\n",
|
||||
"None\n",
|
||||
"id 0\n",
|
||||
"gender 0\n",
|
||||
"age 0\n",
|
||||
"hypertension 0\n",
|
||||
"heart_disease 0\n",
|
||||
"ever_married 0\n",
|
||||
"work_type 0\n",
|
||||
"Residence_type 0\n",
|
||||
"avg_glucose_level 0\n",
|
||||
"bmi 201\n",
|
||||
"smoking_status 0\n",
|
||||
"stroke 0\n",
|
||||
"dtype: int64\n",
|
||||
" id age hypertension heart_disease \\\n",
|
||||
"count 5110.000000 5110.000000 5110.000000 5110.000000 \n",
|
||||
"mean 36517.829354 43.226614 0.097456 0.054012 \n",
|
||||
"std 21161.721625 22.612647 0.296607 0.226063 \n",
|
||||
"min 67.000000 0.080000 0.000000 0.000000 \n",
|
||||
"25% 17741.250000 25.000000 0.000000 0.000000 \n",
|
||||
"50% 36932.000000 45.000000 0.000000 0.000000 \n",
|
||||
"75% 54682.000000 61.000000 0.000000 0.000000 \n",
|
||||
"max 72940.000000 82.000000 1.000000 1.000000 \n",
|
||||
"\n",
|
||||
" avg_glucose_level bmi stroke \n",
|
||||
"count 5110.000000 4909.000000 5110.000000 \n",
|
||||
"mean 106.147677 28.893237 0.048728 \n",
|
||||
"std 45.283560 7.854067 0.215320 \n",
|
||||
"min 55.120000 10.300000 0.000000 \n",
|
||||
"25% 77.245000 23.500000 0.000000 \n",
|
||||
"50% 91.885000 28.100000 0.000000 \n",
|
||||
"75% 114.090000 33.100000 0.000000 \n",
|
||||
"max 271.740000 97.600000 1.000000 \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"# Загрузка датасетов\n",
|
||||
"students_data = pd.read_csv(\"../data\\StudentsPerformance.csv\")\n",
|
||||
"diamonds_data = pd.read_csv(\"../data\\Diamonds Prices2022.csv\")\n",
|
||||
"stroke_data = pd.read_csv(\"../data\\healthcare-dataset-stroke-data.csv\")\n",
|
||||
"\n",
|
||||
"# Информация о наборах данных\n",
|
||||
"for name, dataset in {\"Students\": students_data, \"Diamonds\": diamonds_data, \"Stroke\": stroke_data}.items():\n",
|
||||
" print(f\"--- {name} ---\")\n",
|
||||
" print(dataset.info())\n",
|
||||
" print(dataset.isnull().sum())\n",
|
||||
" print(dataset.describe())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Шаг 3. Примеры бизнес-целей\n",
|
||||
"Оценки студентов: Рекомендации по обучению для повышения успеваемости.\n",
|
||||
"Цены на бриллианты: Оценка стоимости для продаж.\n",
|
||||
"Данные по инсультам: повышения эффективности здравоохранения"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Шаг 4. Цели технического проекта\n",
|
||||
"Примеры:\n",
|
||||
"Оценки студентов:\n",
|
||||
"Вход: Данные о студентах.\n",
|
||||
"Целевой признак: Оценки.\n",
|
||||
"Цены на бриллианты:\n",
|
||||
"Вход: Характеристики бриллиантов.\n",
|
||||
"Целевой признак: Цена.\n",
|
||||
"Данные по инсультам:\n",
|
||||
"Вход: Данные о пациентах.\n",
|
||||
"Целевой признак: Наличие инсульта."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"--- Students ---\n",
|
||||
"Пропущенные значения:\n",
|
||||
" gender 0\n",
|
||||
"race/ethnicity 0\n",
|
||||
"parental level of education 0\n",
|
||||
"lunch 0\n",
|
||||
"test preparation course 0\n",
|
||||
"math score 0\n",
|
||||
"reading score 0\n",
|
||||
"writing score 0\n",
|
||||
"dtype: int64\n",
|
||||
"Выбросы и статистика:\n",
|
||||
" math score reading score writing score\n",
|
||||
"count 1000.00000 1000.000000 1000.000000\n",
|
||||
"mean 66.08900 69.169000 68.054000\n",
|
||||
"std 15.16308 14.600192 15.195657\n",
|
||||
"min 0.00000 17.000000 10.000000\n",
|
||||
"25% 57.00000 59.000000 57.750000\n",
|
||||
"50% 66.00000 70.000000 69.000000\n",
|
||||
"75% 77.00000 79.000000 79.000000\n",
|
||||
"max 100.00000 100.000000 100.000000\n",
|
||||
"--- Diamonds ---\n",
|
||||
"Пропущенные значения:\n",
|
||||
" Unnamed: 0 0\n",
|
||||
"carat 0\n",
|
||||
"cut 0\n",
|
||||
"color 0\n",
|
||||
"clarity 0\n",
|
||||
"depth 0\n",
|
||||
"table 0\n",
|
||||
"price 0\n",
|
||||
"x 0\n",
|
||||
"y 0\n",
|
||||
"z 0\n",
|
||||
"dtype: int64\n",
|
||||
"Выбросы и статистика:\n",
|
||||
" Unnamed: 0 carat depth table price \\\n",
|
||||
"count 53943.000000 53943.000000 53943.000000 53943.000000 53943.000000 \n",
|
||||
"mean 26972.000000 0.797935 61.749322 57.457251 3932.734294 \n",
|
||||
"std 15572.147122 0.473999 1.432626 2.234549 3989.338447 \n",
|
||||
"min 1.000000 0.200000 43.000000 43.000000 326.000000 \n",
|
||||
"25% 13486.500000 0.400000 61.000000 56.000000 950.000000 \n",
|
||||
"50% 26972.000000 0.700000 61.800000 57.000000 2401.000000 \n",
|
||||
"75% 40457.500000 1.040000 62.500000 59.000000 5324.000000 \n",
|
||||
"max 53943.000000 5.010000 79.000000 95.000000 18823.000000 \n",
|
||||
"\n",
|
||||
" x y z \n",
|
||||
"count 53943.000000 53943.000000 53943.000000 \n",
|
||||
"mean 5.731158 5.734526 3.538730 \n",
|
||||
"std 1.121730 1.142103 0.705679 \n",
|
||||
"min 0.000000 0.000000 0.000000 \n",
|
||||
"25% 4.710000 4.720000 2.910000 \n",
|
||||
"50% 5.700000 5.710000 3.530000 \n",
|
||||
"75% 6.540000 6.540000 4.040000 \n",
|
||||
"max 10.740000 58.900000 31.800000 \n",
|
||||
"--- Stroke ---\n",
|
||||
"Пропущенные значения:\n",
|
||||
" id 0\n",
|
||||
"gender 0\n",
|
||||
"age 0\n",
|
||||
"hypertension 0\n",
|
||||
"heart_disease 0\n",
|
||||
"ever_married 0\n",
|
||||
"work_type 0\n",
|
||||
"Residence_type 0\n",
|
||||
"avg_glucose_level 0\n",
|
||||
"bmi 201\n",
|
||||
"smoking_status 0\n",
|
||||
"stroke 0\n",
|
||||
"dtype: int64\n",
|
||||
"Выбросы и статистика:\n",
|
||||
" id age hypertension heart_disease \\\n",
|
||||
"count 5110.000000 5110.000000 5110.000000 5110.000000 \n",
|
||||
"mean 36517.829354 43.226614 0.097456 0.054012 \n",
|
||||
"std 21161.721625 22.612647 0.296607 0.226063 \n",
|
||||
"min 67.000000 0.080000 0.000000 0.000000 \n",
|
||||
"25% 17741.250000 25.000000 0.000000 0.000000 \n",
|
||||
"50% 36932.000000 45.000000 0.000000 0.000000 \n",
|
||||
"75% 54682.000000 61.000000 0.000000 0.000000 \n",
|
||||
"max 72940.000000 82.000000 1.000000 1.000000 \n",
|
||||
"\n",
|
||||
" avg_glucose_level bmi stroke \n",
|
||||
"count 5110.000000 4909.000000 5110.000000 \n",
|
||||
"mean 106.147677 28.893237 0.048728 \n",
|
||||
"std 45.283560 7.854067 0.215320 \n",
|
||||
"min 55.120000 10.300000 0.000000 \n",
|
||||
"25% 77.245000 23.500000 0.000000 \n",
|
||||
"50% 91.885000 28.100000 0.000000 \n",
|
||||
"75% 114.090000 33.100000 0.000000 \n",
|
||||
"max 271.740000 97.600000 1.000000 \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Анализ выбросов и пропущенных данных\n",
|
||||
"for name, dataset in {\"Students\": students_data, \"Diamonds\": diamonds_data, \"Stroke\": stroke_data}.items():\n",
|
||||
" print(f\"--- {name} ---\")\n",
|
||||
" print(\"Пропущенные значения:\\n\", dataset.isnull().sum())\n",
|
||||
" print(\"Выбросы и статистика:\\n\", dataset.describe())\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"--- Students ---\n",
|
||||
"Пропущенные значения:\n",
|
||||
" gender 0\n",
|
||||
"race/ethnicity 0\n",
|
||||
"parental level of education 0\n",
|
||||
"lunch 0\n",
|
||||
"test preparation course 0\n",
|
||||
"math score 0\n",
|
||||
"reading score 0\n",
|
||||
"writing score 0\n",
|
||||
"dtype: int64\n",
|
||||
"Выбросы и статистика:\n",
|
||||
" math score reading score writing score\n",
|
||||
"count 1000.00000 1000.000000 1000.000000\n",
|
||||
"mean 66.08900 69.169000 68.054000\n",
|
||||
"std 15.16308 14.600192 15.195657\n",
|
||||
"min 0.00000 17.000000 10.000000\n",
|
||||
"25% 57.00000 59.000000 57.750000\n",
|
||||
"50% 66.00000 70.000000 69.000000\n",
|
||||
"75% 77.00000 79.000000 79.000000\n",
|
||||
"max 100.00000 100.000000 100.000000\n",
|
||||
"--- Diamonds ---\n",
|
||||
"Пропущенные значения:\n",
|
||||
" Unnamed: 0 0\n",
|
||||
"carat 0\n",
|
||||
"cut 0\n",
|
||||
"color 0\n",
|
||||
"clarity 0\n",
|
||||
"depth 0\n",
|
||||
"table 0\n",
|
||||
"price 0\n",
|
||||
"x 0\n",
|
||||
"y 0\n",
|
||||
"z 0\n",
|
||||
"dtype: int64\n",
|
||||
"Выбросы и статистика:\n",
|
||||
" Unnamed: 0 carat depth table price \\\n",
|
||||
"count 53943.000000 53943.000000 53943.000000 53943.000000 53943.000000 \n",
|
||||
"mean 26972.000000 0.797935 61.749322 57.457251 3932.734294 \n",
|
||||
"std 15572.147122 0.473999 1.432626 2.234549 3989.338447 \n",
|
||||
"min 1.000000 0.200000 43.000000 43.000000 326.000000 \n",
|
||||
"25% 13486.500000 0.400000 61.000000 56.000000 950.000000 \n",
|
||||
"50% 26972.000000 0.700000 61.800000 57.000000 2401.000000 \n",
|
||||
"75% 40457.500000 1.040000 62.500000 59.000000 5324.000000 \n",
|
||||
"max 53943.000000 5.010000 79.000000 95.000000 18823.000000 \n",
|
||||
"\n",
|
||||
" x y z \n",
|
||||
"count 53943.000000 53943.000000 53943.000000 \n",
|
||||
"mean 5.731158 5.734526 3.538730 \n",
|
||||
"std 1.121730 1.142103 0.705679 \n",
|
||||
"min 0.000000 0.000000 0.000000 \n",
|
||||
"25% 4.710000 4.720000 2.910000 \n",
|
||||
"50% 5.700000 5.710000 3.530000 \n",
|
||||
"75% 6.540000 6.540000 4.040000 \n",
|
||||
"max 10.740000 58.900000 31.800000 \n",
|
||||
"--- Stroke ---\n",
|
||||
"Пропущенные значения:\n",
|
||||
" id 0\n",
|
||||
"gender 0\n",
|
||||
"age 0\n",
|
||||
"hypertension 0\n",
|
||||
"heart_disease 0\n",
|
||||
"ever_married 0\n",
|
||||
"work_type 0\n",
|
||||
"Residence_type 0\n",
|
||||
"avg_glucose_level 0\n",
|
||||
"bmi 201\n",
|
||||
"smoking_status 0\n",
|
||||
"stroke 0\n",
|
||||
"dtype: int64\n",
|
||||
"Выбросы и статистика:\n",
|
||||
" id age hypertension heart_disease \\\n",
|
||||
"count 5110.000000 5110.000000 5110.000000 5110.000000 \n",
|
||||
"mean 36517.829354 43.226614 0.097456 0.054012 \n",
|
||||
"std 21161.721625 22.612647 0.296607 0.226063 \n",
|
||||
"min 67.000000 0.080000 0.000000 0.000000 \n",
|
||||
"25% 17741.250000 25.000000 0.000000 0.000000 \n",
|
||||
"50% 36932.000000 45.000000 0.000000 0.000000 \n",
|
||||
"75% 54682.000000 61.000000 0.000000 0.000000 \n",
|
||||
"max 72940.000000 82.000000 1.000000 1.000000 \n",
|
||||
"\n",
|
||||
" avg_glucose_level bmi stroke \n",
|
||||
"count 5110.000000 4909.000000 5110.000000 \n",
|
||||
"mean 106.147677 28.893237 0.048728 \n",
|
||||
"std 45.283560 7.854067 0.215320 \n",
|
||||
"min 55.120000 10.300000 0.000000 \n",
|
||||
"25% 77.245000 23.500000 0.000000 \n",
|
||||
"50% 91.885000 28.100000 0.000000 \n",
|
||||
"75% 114.090000 33.100000 0.000000 \n",
|
||||
"max 271.740000 97.600000 1.000000 \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Анализ выбросов и пропущенных данных\n",
|
||||
"for name, dataset in {\"Students\": students_data, \"Diamonds\": diamonds_data, \"Stroke\": stroke_data}.items():\n",
|
||||
" print(f\"--- {name} ---\")\n",
|
||||
" print(\"Пропущенные значения:\\n\", dataset.isnull().sum())\n",
|
||||
" print(\"Выбросы и статистика:\\n\", dataset.describe())\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"students_data = students_data.dropna()\n",
|
||||
"diamonds_data = diamonds_data.dropna()\n",
|
||||
"stroke_data = stroke_data.dropna()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Данные в целом довольно качественные, без значителньых выбросов или пропущенных значений. Информативность находится на достаточном уровне, датасеты пригодны для дальнейшего анализа"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"--- Students ---\n",
|
||||
"Статистика после очистки:\n",
|
||||
" math score reading score writing score\n",
|
||||
"count 1000.00000 1000.000000 1000.000000\n",
|
||||
"mean 66.08900 69.169000 68.054000\n",
|
||||
"std 15.16308 14.600192 15.195657\n",
|
||||
"min 0.00000 17.000000 10.000000\n",
|
||||
"25% 57.00000 59.000000 57.750000\n",
|
||||
"50% 66.00000 70.000000 69.000000\n",
|
||||
"75% 77.00000 79.000000 79.000000\n",
|
||||
"max 100.00000 100.000000 100.000000\n",
|
||||
"--- Diamonds ---\n",
|
||||
"Статистика после очистки:\n",
|
||||
" Unnamed: 0 carat depth table price \\\n",
|
||||
"count 53943.000000 53943.000000 53943.000000 53943.000000 53943.000000 \n",
|
||||
"mean 26972.000000 0.797935 61.749322 57.457251 3932.734294 \n",
|
||||
"std 15572.147122 0.473999 1.432626 2.234549 3989.338447 \n",
|
||||
"min 1.000000 0.200000 43.000000 43.000000 326.000000 \n",
|
||||
"25% 13486.500000 0.400000 61.000000 56.000000 950.000000 \n",
|
||||
"50% 26972.000000 0.700000 61.800000 57.000000 2401.000000 \n",
|
||||
"75% 40457.500000 1.040000 62.500000 59.000000 5324.000000 \n",
|
||||
"max 53943.000000 5.010000 79.000000 95.000000 18823.000000 \n",
|
||||
"\n",
|
||||
" x y z \n",
|
||||
"count 53943.000000 53943.000000 53943.000000 \n",
|
||||
"mean 5.731158 5.734526 3.538730 \n",
|
||||
"std 1.121730 1.142103 0.705679 \n",
|
||||
"min 0.000000 0.000000 0.000000 \n",
|
||||
"25% 4.710000 4.720000 2.910000 \n",
|
||||
"50% 5.700000 5.710000 3.530000 \n",
|
||||
"75% 6.540000 6.540000 4.040000 \n",
|
||||
"max 10.740000 58.900000 31.800000 \n",
|
||||
"--- Stroke ---\n",
|
||||
"Статистика после очистки:\n",
|
||||
" id age hypertension heart_disease \\\n",
|
||||
"count 4909.000000 4909.000000 4909.000000 4909.000000 \n",
|
||||
"mean 37064.313506 42.865374 0.091872 0.049501 \n",
|
||||
"std 20995.098457 22.555115 0.288875 0.216934 \n",
|
||||
"min 77.000000 0.080000 0.000000 0.000000 \n",
|
||||
"25% 18605.000000 25.000000 0.000000 0.000000 \n",
|
||||
"50% 37608.000000 44.000000 0.000000 0.000000 \n",
|
||||
"75% 55220.000000 60.000000 0.000000 0.000000 \n",
|
||||
"max 72940.000000 82.000000 1.000000 1.000000 \n",
|
||||
"\n",
|
||||
" avg_glucose_level bmi stroke \n",
|
||||
"count 4909.000000 4909.000000 4909.000000 \n",
|
||||
"mean 105.305150 28.893237 0.042575 \n",
|
||||
"std 44.424341 7.854067 0.201917 \n",
|
||||
"min 55.120000 10.300000 0.000000 \n",
|
||||
"25% 77.070000 23.500000 0.000000 \n",
|
||||
"50% 91.680000 28.100000 0.000000 \n",
|
||||
"75% 113.570000 33.100000 0.000000 \n",
|
||||
"max 271.740000 97.600000 1.000000 \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for name, dataset in {\"Students\": students_data, \"Diamonds\": diamonds_data, \"Stroke\": stroke_data}.items():\n",
|
||||
" print(f\"--- {name} ---\")\n",
|
||||
" print(\"Статистика после очистки:\\n\", dataset.describe())\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"students_train, students_test = train_test_split(students_data, test_size=0.2, stratify=students_data['gender'])\n",
|
||||
"diamonds_train, diamonds_test = train_test_split(diamonds_data, test_size=0.2, stratify=diamonds_data['cut'])\n",
|
||||
"stroke_train, stroke_test = train_test_split(stroke_data, test_size=0.2, stratify=stroke_data['stroke'])\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from imblearn.over_sampling import RandomOverSampler\n",
|
||||
"\n",
|
||||
"# Балансировка данных\n",
|
||||
"oversampler = RandomOverSampler(random_state=42)\n",
|
||||
"X_train, y_train = oversampler.fit_resample(stroke_train.drop(columns=['stroke']), stroke_train['stroke'])\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from imblearn.under_sampling import RandomUnderSampler\n",
|
||||
"\n",
|
||||
"# Пример oversampling и undersampling\n",
|
||||
"oversampler = RandomOverSampler(random_state=42)\n",
|
||||
"undersampler = RandomUnderSampler(random_state=42)\n",
|
||||
"\n",
|
||||
"X_over, y_over = oversampler.fit_resample(X_train, y_train)\n",
|
||||
"X_under, y_under = undersampler.fit_resample(X_train, y_train)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
326
Lab_3/lab3.ipynb
326
Lab_3/lab3.ipynb
File diff suppressed because one or more lines are too long
451
Lab_4/lab4.ipynb
451
Lab_4/lab4.ipynb
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
320
Lab_5/lab5.ipynb
Normal file
320
Lab_5/lab5.ipynb
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user