diff --git a/lab2/lab2.ipynb b/lab2/lab2.ipynb
index 357c70e..d29f23c 100644
--- a/lab2/lab2.ipynb
+++ b/lab2/lab2.ipynb
@@ -36,16 +36,23 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 1,
"metadata": {},
"outputs": [
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',\n",
- " 'price', 'x', 'y', 'z'],\n",
- " dtype='object')\n"
+ "ename": "KeyboardInterrupt",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[1], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m../data/Diamonds-Prices.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(df\u001b[38;5;241m.\u001b[39mcolumns)\n",
+ "File \u001b[1;32mc:\\Users\\bocchanskyy\\source\\repos\\MAI_PIbd-33_Volkov_NA\\.venv\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1026\u001b[0m, in \u001b[0;36mread_csv\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[0;32m 1013\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[0;32m 1014\u001b[0m dialect,\n\u001b[0;32m 1015\u001b[0m delimiter,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1022\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[0;32m 1023\u001b[0m )\n\u001b[0;32m 1024\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[1;32m-> 1026\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[1;32mc:\\Users\\bocchanskyy\\source\\repos\\MAI_PIbd-33_Volkov_NA\\.venv\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:620\u001b[0m, in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 617\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[0;32m 619\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[1;32m--> 620\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[0;32m 623\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n",
+ "File \u001b[1;32mc:\\Users\\bocchanskyy\\source\\repos\\MAI_PIbd-33_Volkov_NA\\.venv\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1620\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m 1617\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 1619\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m-> 1620\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[1;32mc:\\Users\\bocchanskyy\\source\\repos\\MAI_PIbd-33_Volkov_NA\\.venv\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1898\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[1;34m(self, f, engine)\u001b[0m\n\u001b[0;32m 1895\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(msg)\n\u001b[0;32m 1897\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m-> 1898\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmapping\u001b[49m\u001b[43m[\u001b[49m\u001b[43mengine\u001b[49m\u001b[43m]\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1899\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[0;32m 1900\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
+ "File \u001b[1;32mc:\\Users\\bocchanskyy\\source\\repos\\MAI_PIbd-33_Volkov_NA\\.venv\\Lib\\site-packages\\pandas\\io\\parsers\\c_parser_wrapper.py:93\u001b[0m, in \u001b[0;36mCParserWrapper.__init__\u001b[1;34m(self, src, **kwds)\u001b[0m\n\u001b[0;32m 90\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype_backend\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyarrow\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m 91\u001b[0m \u001b[38;5;66;03m# Fail here loudly instead of in cython after reading\u001b[39;00m\n\u001b[0;32m 92\u001b[0m import_optional_dependency(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyarrow\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m---> 93\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reader \u001b[38;5;241m=\u001b[39m \u001b[43mparsers\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mTextReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43msrc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 95\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39munnamed_cols \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reader\u001b[38;5;241m.\u001b[39munnamed_cols\n\u001b[0;32m 97\u001b[0m \u001b[38;5;66;03m# error: Cannot determine type of 'names'\u001b[39;00m\n",
+ "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
]
}
],
@@ -1432,7 +1439,7 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -1473,7 +1480,7 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -1519,47 +1526,83 @@
],
"source": [
"import seaborn as sns\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
"\n",
"plt.figure(figsize=(10, 6))\n",
"\n",
+ "\n",
"# Связь между возрастом и состоянием\n",
+ "\n",
"plt.subplot(2, 2, 1)\n",
+ "\n",
"sns.scatterplot(data=df, x=\"Age\", y=\"Networth\")\n",
+ "\n",
"plt.title(\"Связь между возрастом и состоянием\")\n",
+ "\n",
"plt.xlabel(\"Возраст\")\n",
+ "\n",
"plt.ylabel(\"Состояние (млрд)\")\n",
+ "\n",
"plt.show()\n",
"\n",
"\n",
+ "\n",
"# Связь между страной проживания и состоянием (топ-10 стран)\n",
+ "\n",
"plt.subplot(2, 2, 2)\n",
+ "\n",
"top_countries = df[\"Country\"].value_counts().index[:10]\n",
+ "\n",
"sns.boxplot(data=df[df[\"Country\"].isin(top_countries)], x=\"Country\", y=\"Networth\")\n",
+ "\n",
"plt.title(\"Связь между страной проживания и состоянием\")\n",
+ "\n",
"plt.xticks(rotation=90)\n",
+ "\n",
"plt.xlabel(\"Страна\")\n",
+ "\n",
"plt.ylabel(\"Состояние (млрд)\")\n",
+ "\n",
"plt.show()\n",
"\n",
"\n",
+ "\n",
"# Связь между источником дохода и состоянием (топ-10 источников дохода)\n",
+ "\n",
"plt.subplot(2, 2, 3)\n",
+ "\n",
"top_sources = df[\"Source\"].value_counts().index[:10]\n",
+ "\n",
"sns.boxplot(data=df[df[\"Source\"].isin(top_sources)], x=\"Source\", y=\"Networth\")\n",
+ "\n",
"plt.title(\"Связь между источником дохода и состоянием\")\n",
+ "\n",
"plt.xticks(rotation=90)\n",
+ "\n",
"plt.xlabel(\"Источник дохода\")\n",
+ "\n",
"plt.ylabel(\"Состояние (млрд)\")\n",
+ "\n",
"plt.show()\n",
"\n",
+ "\n",
"# Связь между отраслью и состоянием (топ-10 отраслей)\n",
+ "\n",
"plt.subplot(2, 2, 4)\n",
+ "\n",
"top_industries = df[\"Industry\"].value_counts().index[:10]\n",
+ "\n",
"sns.boxplot(data=df[df[\"Industry\"].isin(top_industries)], x=\"Industry\", y=\"Networth\")\n",
+ "\n",
"plt.title(\"Связь между отраслью и состоянием\")\n",
+ "\n",
"plt.xticks(rotation=90)\n",
+ "\n",
"plt.xlabel(\"Отрасль\")\n",
+ "\n",
"plt.ylabel(\"Состояние (млрд)\")\n",
+ "\n",
"plt.show()"
]
},
@@ -1572,7 +1615,7 @@
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -1685,7 +1728,7 @@
"max 2578.000000 219.000000 100.000000"
]
},
- "execution_count": 28,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -1711,7 +1754,7 @@
},
{
"cell_type": "code",
- "execution_count": 30,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -1754,7 +1797,7 @@
},
{
"cell_type": "code",
- "execution_count": 32,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -1788,7 +1831,7 @@
},
{
"cell_type": "code",
- "execution_count": 33,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -1824,29 +1867,51 @@
],
"source": [
"# 1. Столбчатая диаграмма по странам\n",
+ "\n",
"plt.figure(figsize=(12, 6))\n",
+ "\n",
"sns.countplot(data=df, x=\"Country\", order=df[\"Country\"].value_counts().index)\n",
+ "\n",
"plt.title(\"Количество людей по странам\")\n",
+ "\n",
"plt.xlabel(\"Страна\")\n",
+ "\n",
"plt.ylabel(\"Количество\")\n",
+ "\n",
"plt.xticks(rotation=45)\n",
+ "\n",
"plt.show()\n",
"\n",
+ "\n",
"# 2. Столбчатая диаграмма по отраслям\n",
+ "\n",
"plt.figure(figsize=(12, 6))\n",
+ "\n",
"sns.countplot(data=df, x=\"Industry\", order=df[\"Industry\"].value_counts().index)\n",
+ "\n",
"plt.title(\"Количество людей по отраслям\")\n",
+ "\n",
"plt.xlabel(\"Отрасль\")\n",
+ "\n",
"plt.ylabel(\"Количество\")\n",
+ "\n",
"plt.xticks(rotation=45)\n",
+ "\n",
"plt.show()\n",
"\n",
+ "\n",
"# 3. Гистограмма для анализа возраста\n",
+ "\n",
"plt.figure(figsize=(10, 5))\n",
+ "\n",
"sns.histplot(df[\"Age\"], bins=30, kde=True)\n",
+ "\n",
"plt.title(\"Распределение возраста\")\n",
+ "\n",
"plt.xlabel(\"Возраст\")\n",
+ "\n",
"plt.ylabel(\"Частота\")\n",
+ "\n",
"plt.show()"
]
},
@@ -1866,7 +1931,7 @@
},
{
"cell_type": "code",
- "execution_count": 49,
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
@@ -1875,7 +1940,7 @@
"((1560, 6), (520, 6), (520, 6))"
]
},
- "execution_count": 49,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -1901,7 +1966,7 @@
},
{
"cell_type": "code",
- "execution_count": 39,
+ "execution_count": 13,
"metadata": {},
"outputs": [
{
@@ -1936,7 +2001,7 @@
" Name: Networth, dtype: float64)"
]
},
- "execution_count": 39,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -1952,9 +2017,21 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "ename": "ImportError",
+ "evalue": "cannot import name 'SMOTEREG' from 'imblearn.over_sampling' (c:\\Users\\bocchanskyy\\source\\repos\\MAI_PIbd-33_Volkov_NA\\.venv\\Lib\\site-packages\\imblearn\\over_sampling\\__init__.py)",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mImportError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[16], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mimblearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mover_sampling\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m SMOTEREG\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mimblearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01munder_sampling\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m RandomUnderSampler\n\u001b[0;32m 4\u001b[0m oversampler \u001b[38;5;241m=\u001b[39m SMOTEREG(random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m12\u001b[39m)\n",
+ "\u001b[1;31mImportError\u001b[0m: cannot import name 'SMOTEREG' from 'imblearn.over_sampling' (c:\\Users\\bocchanskyy\\source\\repos\\MAI_PIbd-33_Volkov_NA\\.venv\\Lib\\site-packages\\imblearn\\over_sampling\\__init__.py)"
+ ]
+ }
+ ],
"source": [
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
@@ -1980,9 +2057,9 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python (.venv)",
+ "display_name": ".venv",
"language": "python",
- "name": ".venv"
+ "name": "python3"
},
"language_info": {
"codemirror_mode": {
diff --git a/lab3/lab3.ipynb b/lab3/lab3.ipynb
new file mode 100644
index 0000000..9b892a8
--- /dev/null
+++ b/lab3/lab3.ipynb
@@ -0,0 +1,1121 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Вариант 4. Данные по инсультам"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',\n",
+ " 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',\n",
+ " 'smoking_status', 'stroke'],\n",
+ " dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " gender | \n",
+ " age | \n",
+ " hypertension | \n",
+ " heart_disease | \n",
+ " ever_married | \n",
+ " work_type | \n",
+ " Residence_type | \n",
+ " avg_glucose_level | \n",
+ " bmi | \n",
+ " smoking_status | \n",
+ " stroke | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 9046 | \n",
+ " Male | \n",
+ " 67.0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Yes | \n",
+ " Private | \n",
+ " Urban | \n",
+ " 228.69 | \n",
+ " 36.6 | \n",
+ " formerly smoked | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 51676 | \n",
+ " Female | \n",
+ " 61.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Yes | \n",
+ " Self-employed | \n",
+ " Rural | \n",
+ " 202.21 | \n",
+ " NaN | \n",
+ " never smoked | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 31112 | \n",
+ " Male | \n",
+ " 80.0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Yes | \n",
+ " Private | \n",
+ " Rural | \n",
+ " 105.92 | \n",
+ " 32.5 | \n",
+ " never smoked | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 60182 | \n",
+ " Female | \n",
+ " 49.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Yes | \n",
+ " Private | \n",
+ " Urban | \n",
+ " 171.23 | \n",
+ " 34.4 | \n",
+ " smokes | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1665 | \n",
+ " Female | \n",
+ " 79.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " Yes | \n",
+ " Self-employed | \n",
+ " Rural | \n",
+ " 174.12 | \n",
+ " 24.0 | \n",
+ " never smoked | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id gender age hypertension heart_disease ever_married \\\n",
+ "0 9046 Male 67.0 0 1 Yes \n",
+ "1 51676 Female 61.0 0 0 Yes \n",
+ "2 31112 Male 80.0 0 1 Yes \n",
+ "3 60182 Female 49.0 0 0 Yes \n",
+ "4 1665 Female 79.0 1 0 Yes \n",
+ "\n",
+ " work_type Residence_type avg_glucose_level bmi smoking_status \\\n",
+ "0 Private Urban 228.69 36.6 formerly smoked \n",
+ "1 Self-employed Rural 202.21 NaN never smoked \n",
+ "2 Private Rural 105.92 32.5 never smoked \n",
+ "3 Private Urban 171.23 34.4 smokes \n",
+ "4 Self-employed Rural 174.12 24.0 never smoked \n",
+ "\n",
+ " stroke \n",
+ "0 1 \n",
+ "1 1 \n",
+ "2 1 \n",
+ "3 1 \n",
+ "4 1 "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from imblearn.over_sampling import RandomOverSampler\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "import featuretools as ft\n",
+ "import time\n",
+ "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.model_selection import cross_val_score\n",
+ "\n",
+ "df = pd.read_csv(\"../data/healthcare-dataset-stroke-data.csv\")\n",
+ "\n",
+ "print(df.columns)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Бизнес-цели и цели технического проекта.\n",
+ "## Бизнес-цели:\n",
+ "### 1. Предсказание инсульта: Разработать систему, которая сможет предсказать вероятность инсульта у пациентов на основе их медицинских и социальных данных. Это может помочь медицинским учреждениям и специалистам в более раннем выявлении пациентов с высоким риском.\n",
+ "### 2. Снижение затрат на лечение: Предупреждение инсультов у пациентов позволит снизить затраты на лечение и реабилитацию. Это также поможет улучшить качество медицинских услуг и повысить удовлетворенность пациентов.\n",
+ "### 3. Повышение эффективности профилактики: Выявление факторов риска инсульта на ранней стадии может способствовать более эффективному проведению профилактических мероприятий.\n",
+ "## Цели технического проекта:\n",
+ "### 1. Создание и обучение модели машинного обучения: Разработка модели, способной предсказать вероятность инсульта на основе данных о пациентах (например, возраст, уровень глюкозы, наличие сердечно-сосудистых заболеваний, тип работы, индекс массы тела и т.д.).\n",
+ "### 2. Анализ и обработка данных: Провести предобработку данных (очистка, заполнение пропущенных значений, кодирование категориальных признаков), чтобы улучшить качество и надежность модели.\n",
+ "### 3. Оценка модели: Использовать метрики, такие как точность, полнота и F1-мера, чтобы оценить эффективность модели и минимизировать риск ложноположительных и ложноотрицательных предсказаний."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "id 0\n",
+ "gender 0\n",
+ "age 0\n",
+ "hypertension 0\n",
+ "heart_disease 0\n",
+ "ever_married 0\n",
+ "work_type 0\n",
+ "Residence_type 0\n",
+ "avg_glucose_level 0\n",
+ "bmi 201\n",
+ "smoking_status 0\n",
+ "stroke 0\n",
+ "dtype: int64\n",
+ "\n",
+ "id False\n",
+ "gender False\n",
+ "age False\n",
+ "hypertension False\n",
+ "heart_disease False\n",
+ "ever_married False\n",
+ "work_type False\n",
+ "Residence_type False\n",
+ "avg_glucose_level False\n",
+ "bmi True\n",
+ "smoking_status False\n",
+ "stroke False\n",
+ "dtype: bool\n",
+ "\n",
+ "bmi процент пустых значений: %3.93\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(df.isnull().sum())\n",
+ "print()\n",
+ "\n",
+ "print(df.isnull().any())\n",
+ "print()\n",
+ "\n",
+ "for i in df.columns:\n",
+ " null_rate = df[i].isnull().sum() / len(df) * 100\n",
+ " if null_rate > 0:\n",
+ " print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Видим пустые значения в bmi, заменяем их медианой"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Количество пустых значений в каждом столбце после замены:\n",
+ "id 0\n",
+ "gender 0\n",
+ "age 0\n",
+ "hypertension 0\n",
+ "heart_disease 0\n",
+ "ever_married 0\n",
+ "work_type 0\n",
+ "Residence_type 0\n",
+ "avg_glucose_level 0\n",
+ "bmi 0\n",
+ "smoking_status 0\n",
+ "stroke 0\n",
+ "dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "df[\"bmi\"] = df[\"bmi\"].fillna(df[\"bmi\"].median())\n",
+ "\n",
+ "missing_values = df.isnull().sum()\n",
+ "\n",
+ "print(\"Количество пустых значений в каждом столбце после замены:\")\n",
+ "print(missing_values)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',\n",
+ " 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',\n",
+ " 'smoking_status', 'stroke'],\n",
+ " dtype='object')\n"
+ ]
+ }
+ ],
+ "source": [
+ "df = df.drop('id', axis = 1)\n",
+ "print(df.columns)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Создаем выборки"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Размер обучающей выборки: (2503, 10)\n",
+ "Размер контрольной выборки: (1074, 10)\n",
+ "Размер тестовой выборки: (1533, 10)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Разделим данные на признаки (X) и целевую переменную (y)\n",
+ "# Начнем со stroke\n",
+ "X = df.drop(columns=['stroke'])\n",
+ "y = df['stroke']\n",
+ "\n",
+ "# Разбиваем на обучающую и тестовую выборки\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)\n",
+ "\n",
+ "# Разбиваем на обучающую и контрольную выборки\n",
+ "X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3)\n",
+ "\n",
+ "print(\"Размер обучающей выборки: \", X_train.shape)\n",
+ "print(\"Размер контрольной выборки: \", X_val.shape)\n",
+ "print(\"Размер тестовой выборки: \", X_test.shape)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Оценим сбалансированность выборок"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Распределение классов в обучающей выборке:\n",
+ "stroke\n",
+ "0 0.951658\n",
+ "1 0.048342\n",
+ "Name: proportion, dtype: float64\n",
+ "\n",
+ "Распределение классов в контрольной выборке:\n",
+ "stroke\n",
+ "0 0.947858\n",
+ "1 0.052142\n",
+ "Name: proportion, dtype: float64\n",
+ "\n",
+ "Распределение классов в тестовой выборке:\n",
+ "stroke\n",
+ "0 0.953033\n",
+ "1 0.046967\n",
+ "Name: proportion, dtype: float64\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "