cloned lec2

2024-09-28 13:49:26 +03:00 · 2024-09-28 13:49:26 +03:00 · ce8a562bf1
commit ce8a562bf1
parent 405bc0e297
1 changed files with 935 additions and 0 deletions
--- a/lab_2/lec2.ipynb
+++ b/lab_2/lec2.ipynb
@ -0,0 +1,935 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Загрузка данных в DataFrame"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<>:3: SyntaxWarning: invalid escape sequence '\\c'\n",
+      "<>:3: SyntaxWarning: invalid escape sequence '\\c'\n",
+      "C:\\Users\\New\\AppData\\Local\\Temp\\ipykernel_9568\\2466488670.py:3: SyntaxWarning: invalid escape sequence '\\c'\n",
+      "  df = pd.read_csv(\"static\\csv\\Forbes Billionaires.csv\", index_col=\"PassengerId\")\n",
+      "C:\\Users\\New\\AppData\\Local\\Temp\\ipykernel_9568\\2466488670.py:3: SyntaxWarning: invalid escape sequence '\\c'\n",
+      "  df = pd.read_csv(\"static\\csv\\Forbes Billionaires.csv\", index_col=\"PassengerId\")\n"
+     ]
+    },
+    {
+     "ename": "FileNotFoundError",
+     "evalue": "[Errno 2] No such file or directory: 'static\\\\csv\\\\Forbes Billionaires.csv'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[2], line 3\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstatic\u001b[39;49m\u001b[38;5;124;43m\\\u001b[39;49m\u001b[38;5;124;43mcsv\u001b[39;49m\u001b[38;5;124;43m\\\u001b[39;49m\u001b[38;5;124;43mForbes Billionaires.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex_col\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPassengerId\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m      5\u001b[0m df\u001b[38;5;241m.\u001b[39minfo()\n\u001b[0;32m      7\u001b[0m \u001b[38;5;28mprint\u001b[39m(df\u001b[38;5;241m.\u001b[39mshape)\n",
+      "File \u001b[1;32md:\\5semestr\\AIM\\aimvenv\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1026\u001b[0m, in \u001b[0;36mread_csv\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[0;32m   1013\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[0;32m   1014\u001b[0m     dialect,\n\u001b[0;32m   1015\u001b[0m     delimiter,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   1022\u001b[0m     dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[0;32m   1023\u001b[0m )\n\u001b[0;32m   1024\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[1;32m-> 1026\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[1;32md:\\5semestr\\AIM\\aimvenv\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:620\u001b[0m, in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m    617\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[0;32m    619\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[1;32m--> 620\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[0;32m    623\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m parser\n",
+      "File \u001b[1;32md:\\5semestr\\AIM\\aimvenv\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1620\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m   1617\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m   1619\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m-> 1620\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[1;32md:\\5semestr\\AIM\\aimvenv\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1880\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[1;34m(self, f, engine)\u001b[0m\n\u001b[0;32m   1878\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[0;32m   1879\u001b[0m         mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m-> 1880\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   1881\u001b[0m \u001b[43m    \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1882\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1883\u001b[0m \u001b[43m    \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1884\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1885\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1886\u001b[0m \u001b[43m    \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1887\u001b[0m \u001b[43m    \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1888\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1889\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   
1890\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m   1891\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n",
+      "File \u001b[1;32md:\\5semestr\\AIM\\aimvenv\\Lib\\site-packages\\pandas\\io\\common.py:873\u001b[0m, in \u001b[0;36mget_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[0;32m    868\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m    869\u001b[0m     \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[0;32m    870\u001b[0m     \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[0;32m    871\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[0;32m    872\u001b[0m         \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[1;32m--> 873\u001b[0m         handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[0;32m    874\u001b[0m \u001b[43m            \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    875\u001b[0m \u001b[43m            \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    876\u001b[0m \u001b[43m            \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    877\u001b[0m \u001b[43m            \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    878\u001b[0m \u001b[43m            
\u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m    879\u001b[0m \u001b[43m        \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    880\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m    881\u001b[0m         \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[0;32m    882\u001b[0m         handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n",
+      "\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'static\\\\csv\\\\Forbes Billionaires.csv'"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# Forward slashes keep the path free of backslash escapes: in a normal string\n",
+    "# literal a sequence such as \\c is an invalid escape and raises SyntaxWarning.\n",
+    "# NOTE(review): the recorded run of this cell failed with FileNotFoundError -\n",
+    "# confirm the file exists at this path relative to the working directory.\n",
+    "# NOTE(review): index_col=PassengerId looks carried over from the Titanic data\n",
+    "# shown in the later outputs - confirm the Forbes file really has this column.\n",
+    "df = pd.read_csv(\"static/csv/Forbes Billionaires.csv\", index_col=\"PassengerId\")\n",
+    "\n",
+    "# Column dtypes and non-null counts\n",
+    "df.info()\n",
+    "\n",
+    "# (rows, columns)\n",
+    "print(df.shape)\n",
+    "\n",
+    "# First rows of the frame\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Получение сведений о пропущенных данных"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Типы пропущенных данных:\n",
+    "- None - представление пустых данных в Python\n",
+    "- NaN - представление пустых данных в Pandas\n",
+    "- '' - пустая строка"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Survived      0\n",
+      "Pclass        0\n",
+      "Name          0\n",
+      "Sex           0\n",
+      "Age         177\n",
+      "SibSp         0\n",
+      "Parch         0\n",
+      "Ticket        0\n",
+      "Fare          0\n",
+      "Cabin       687\n",
+      "Embarked      2\n",
+      "dtype: int64\n",
+      "\n",
+      "Survived    False\n",
+      "Pclass      False\n",
+      "Name        False\n",
+      "Sex         False\n",
+      "Age          True\n",
+      "SibSp       False\n",
+      "Parch       False\n",
+      "Ticket      False\n",
+      "Fare        False\n",
+      "Cabin        True\n",
+      "Embarked     True\n",
+      "dtype: bool\n",
+      "\n",
+      "Age процент пустых значений: %19.87\n",
+      "Cabin процент пустых значений: %77.10\n",
+      "Embarked процент пустых значений: %0.22\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Количество пустых значений признаков\n",
+    "print(df.isnull().sum())\n",
+    "\n",
+    "print()\n",
+    "\n",
+    "# Есть ли пустые значения признаков\n",
+    "print(df.isnull().any())\n",
+    "\n",
+    "print()\n",
+    "\n",
+    "# Процент пустых значений признаков\n",
+    "for i in df.columns:\n",
+    "    null_rate = df[i].isnull().sum() / len(df) * 100\n",
+    "    if null_rate > 0:\n",
+    "        print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Заполнение пропущенных данных\n",
+    "\n",
+    "https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n",
+    "\n",
+    "https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(891, 11)\n",
+      "Survived    False\n",
+      "Pclass      False\n",
+      "Name        False\n",
+      "Sex         False\n",
+      "Age         False\n",
+      "SibSp       False\n",
+      "Parch       False\n",
+      "Ticket      False\n",
+      "Fare        False\n",
+      "Cabin       False\n",
+      "Embarked    False\n",
+      "dtype: bool\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Survived</th>\n",
+       "      <th>Pclass</th>\n",
+       "      <th>Name</th>\n",
+       "      <th>Sex</th>\n",
+       "      <th>Age</th>\n",
+       "      <th>SibSp</th>\n",
+       "      <th>Parch</th>\n",
+       "      <th>Ticket</th>\n",
+       "      <th>Fare</th>\n",
+       "      <th>Cabin</th>\n",
+       "      <th>Embarked</th>\n",
+       "      <th>AgeFillNA</th>\n",
+       "      <th>AgeFillMedian</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>PassengerId</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>887</th>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>Montvila, Rev. Juozas</td>\n",
+       "      <td>male</td>\n",
+       "      <td>27.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>211536</td>\n",
+       "      <td>13.00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>S</td>\n",
+       "      <td>27.0</td>\n",
+       "      <td>27.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>888</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Graham, Miss. Margaret Edith</td>\n",
+       "      <td>female</td>\n",
+       "      <td>19.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>112053</td>\n",
+       "      <td>30.00</td>\n",
+       "      <td>B42</td>\n",
+       "      <td>S</td>\n",
+       "      <td>19.0</td>\n",
+       "      <td>19.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>889</th>\n",
+       "      <td>0</td>\n",
+       "      <td>3</td>\n",
+       "      <td>Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
+       "      <td>female</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "      <td>W./C. 6607</td>\n",
+       "      <td>23.45</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>S</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>28.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>890</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Behr, Mr. Karl Howell</td>\n",
+       "      <td>male</td>\n",
+       "      <td>26.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>111369</td>\n",
+       "      <td>30.00</td>\n",
+       "      <td>C148</td>\n",
+       "      <td>C</td>\n",
+       "      <td>26.0</td>\n",
+       "      <td>26.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>891</th>\n",
+       "      <td>0</td>\n",
+       "      <td>3</td>\n",
+       "      <td>Dooley, Mr. Patrick</td>\n",
+       "      <td>male</td>\n",
+       "      <td>32.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>370376</td>\n",
+       "      <td>7.75</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Q</td>\n",
+       "      <td>32.0</td>\n",
+       "      <td>32.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             Survived  Pclass                                      Name  \\\n",
+       "PassengerId                                                               \n",
+       "887                 0       2                     Montvila, Rev. Juozas   \n",
+       "888                 1       1              Graham, Miss. Margaret Edith   \n",
+       "889                 0       3  Johnston, Miss. Catherine Helen \"Carrie\"   \n",
+       "890                 1       1                     Behr, Mr. Karl Howell   \n",
+       "891                 0       3                       Dooley, Mr. Patrick   \n",
+       "\n",
+       "                Sex   Age  SibSp  Parch      Ticket   Fare Cabin Embarked  \\\n",
+       "PassengerId                                                                 \n",
+       "887            male  27.0      0      0      211536  13.00   NaN        S   \n",
+       "888          female  19.0      0      0      112053  30.00   B42        S   \n",
+       "889          female   NaN      1      2  W./C. 6607  23.45   NaN        S   \n",
+       "890            male  26.0      0      0      111369  30.00  C148        C   \n",
+       "891            male  32.0      0      0      370376   7.75   NaN        Q   \n",
+       "\n",
+       "             AgeFillNA  AgeFillMedian  \n",
+       "PassengerId                            \n",
+       "887               27.0           27.0  \n",
+       "888               19.0           19.0  \n",
+       "889                0.0           28.0  \n",
+       "890               26.0           26.0  \n",
+       "891               32.0           32.0  "
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fillna_df = df.fillna(0)\n",
+    "\n",
+    "print(fillna_df.shape)\n",
+    "\n",
+    "print(fillna_df.isnull().any())\n",
+    "\n",
+    "# Замена пустых данных на 0\n",
+    "df[\"AgeFillNA\"] = df[\"Age\"].fillna(0)\n",
+    "\n",
+    "# Замена пустых данных на медиану\n",
+    "df[\"AgeFillMedian\"] = df[\"Age\"].fillna(df[\"Age\"].median())\n",
+    "\n",
+    "df.tail()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Survived</th>\n",
+       "      <th>Pclass</th>\n",
+       "      <th>Name</th>\n",
+       "      <th>Sex</th>\n",
+       "      <th>Age</th>\n",
+       "      <th>SibSp</th>\n",
+       "      <th>Parch</th>\n",
+       "      <th>Ticket</th>\n",
+       "      <th>Fare</th>\n",
+       "      <th>Cabin</th>\n",
+       "      <th>Embarked</th>\n",
+       "      <th>AgeFillNA</th>\n",
+       "      <th>AgeFillMedian</th>\n",
+       "      <th>AgeCopy</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>PassengerId</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>887</th>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>Montvila, Rev. Juozas</td>\n",
+       "      <td>male</td>\n",
+       "      <td>27.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>211536</td>\n",
+       "      <td>13.00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>S</td>\n",
+       "      <td>27.0</td>\n",
+       "      <td>27.0</td>\n",
+       "      <td>27.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>888</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Graham, Miss. Margaret Edith</td>\n",
+       "      <td>female</td>\n",
+       "      <td>19.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>112053</td>\n",
+       "      <td>30.00</td>\n",
+       "      <td>B42</td>\n",
+       "      <td>S</td>\n",
+       "      <td>19.0</td>\n",
+       "      <td>19.0</td>\n",
+       "      <td>19.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>889</th>\n",
+       "      <td>0</td>\n",
+       "      <td>3</td>\n",
+       "      <td>Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
+       "      <td>female</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "      <td>W./C. 6607</td>\n",
+       "      <td>23.45</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>S</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>28.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>890</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Behr, Mr. Karl Howell</td>\n",
+       "      <td>male</td>\n",
+       "      <td>26.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>111369</td>\n",
+       "      <td>30.00</td>\n",
+       "      <td>C148</td>\n",
+       "      <td>C</td>\n",
+       "      <td>26.0</td>\n",
+       "      <td>26.0</td>\n",
+       "      <td>26.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>891</th>\n",
+       "      <td>0</td>\n",
+       "      <td>3</td>\n",
+       "      <td>Dooley, Mr. Patrick</td>\n",
+       "      <td>male</td>\n",
+       "      <td>32.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>370376</td>\n",
+       "      <td>7.75</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Q</td>\n",
+       "      <td>32.0</td>\n",
+       "      <td>32.0</td>\n",
+       "      <td>32.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             Survived  Pclass                                      Name  \\\n",
+       "PassengerId                                                               \n",
+       "887                 0       2                     Montvila, Rev. Juozas   \n",
+       "888                 1       1              Graham, Miss. Margaret Edith   \n",
+       "889                 0       3  Johnston, Miss. Catherine Helen \"Carrie\"   \n",
+       "890                 1       1                     Behr, Mr. Karl Howell   \n",
+       "891                 0       3                       Dooley, Mr. Patrick   \n",
+       "\n",
+       "                Sex   Age  SibSp  Parch      Ticket   Fare Cabin Embarked  \\\n",
+       "PassengerId                                                                 \n",
+       "887            male  27.0      0      0      211536  13.00   NaN        S   \n",
+       "888          female  19.0      0      0      112053  30.00   B42        S   \n",
+       "889          female   NaN      1      2  W./C. 6607  23.45   NaN        S   \n",
+       "890            male  26.0      0      0      111369  30.00  C148        C   \n",
+       "891            male  32.0      0      0      370376   7.75   NaN        Q   \n",
+       "\n",
+       "             AgeFillNA  AgeFillMedian  AgeCopy  \n",
+       "PassengerId                                     \n",
+       "887               27.0           27.0     27.0  \n",
+       "888               19.0           19.0     19.0  \n",
+       "889                0.0           28.0      0.0  \n",
+       "890               26.0           26.0     26.0  \n",
+       "891               32.0           32.0     32.0  "
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[\"AgeCopy\"] = df[\"Age\"]\n",
+    "\n",
+    "# Замена данных сразу в DataFrame без копирования\n",
+    "df.fillna({\"AgeCopy\": 0}, inplace=True)\n",
+    "\n",
+    "df.tail()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Удаление наблюдений с пропусками"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(183, 14)\n",
+      "Survived    False\n",
+      "Pclass      False\n",
+      "Name        False\n",
+      "Sex         False\n",
+      "Age         False\n",
+      "SibSp       False\n",
+      "Parch       False\n",
+      "Ticket      False\n",
+      "Fare        False\n",
+      "Cabin       False\n",
+      "Embarked    False\n",
+      "dtype: bool\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Keep only the rows that have no missing value in any column\n",
+    "dropna_df = df.dropna()\n",
+    "\n",
+    "print(dropna_df.shape)\n",
+    "\n",
+    "# Check the frame produced by dropna: every column should report False\n",
+    "print(dropna_df.isnull().any())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Создание выборок данных\n",
+    "\n",
+    "Библиотека scikit-learn\n",
+    "\n",
+    "https://scikit-learn.org/stable/index.html"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<img src=\"assets/lec2-split.png\" width=\"600\" style=\"background-color: white\">"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Функция для создания выборок\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "\n",
+    "def split_stratified_into_train_val_test(\n",
+    "    df_input,\n",
+    "    stratify_colname=\"y\",\n",
+    "    frac_train=0.6,\n",
+    "    frac_val=0.15,\n",
+    "    frac_test=0.25,\n",
+    "    random_state=None,\n",
+    "):\n",
+    "    \"\"\"\n",
+    "    Split a Pandas dataframe into stratified train, val, and test subsets.\n",
+    "\n",
+    "    The split follows the fractional ratios provided by the user, and each\n",
+    "    subset is stratified by the values in a specific column (that is, each\n",
+    "    subset has the same relative frequency of the values in the column).\n",
+    "    The splitting is performed by running train_test_split() twice.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    df_input : Pandas dataframe\n",
+    "        Input dataframe to be split.\n",
+    "    stratify_colname : str\n",
+    "        The name of the column that will be used for stratification. Usually\n",
+    "        this column would be for the label.\n",
+    "    frac_train : float\n",
+    "    frac_val   : float\n",
+    "    frac_test  : float\n",
+    "        The ratios with which the dataframe will be split into train, val, and\n",
+    "        test data. The values should be expressed as float fractions and should\n",
+    "        sum to 1.0 (checked with a floating-point tolerance).\n",
+    "    random_state : int, None, or RandomStateInstance\n",
+    "        Value to be passed to both train_test_split() calls.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    df_train, df_val, df_test :\n",
+    "        Dataframes containing the three splits.\n",
+    "\n",
+    "    Raises\n",
+    "    ------\n",
+    "    ValueError\n",
+    "        If the fractions do not add up to 1.0, or if stratify_colname is not\n",
+    "        a column of df_input.\n",
+    "    \"\"\"\n",
+    "    import math\n",
+    "\n",
+    "    # Compare with a tolerance: sums of valid fractions such as\n",
+    "    # 0.7 + 0.2 + 0.1 are not exactly 1.0 in binary floating point.\n",
+    "    if not math.isclose(frac_train + frac_val + frac_test, 1.0):\n",
+    "        raise ValueError(\n",
+    "            \"fractions %f, %f, %f do not add up to 1.0\"\n",
+    "            % (frac_train, frac_val, frac_test)\n",
+    "        )\n",
+    "\n",
+    "    if stratify_colname not in df_input.columns:\n",
+    "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
+    "\n",
+    "    X = df_input  # Contains all columns.\n",
+    "    y = df_input[\n",
+    "        [stratify_colname]\n",
+    "    ]  # Dataframe of just the column on which to stratify.\n",
+    "\n",
+    "    # Split original dataframe into train and temp dataframes.\n",
+    "    # The label splits are needed only to stratify the second split.\n",
+    "    df_train, df_temp, _y_train, y_temp = train_test_split(\n",
+    "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
+    "    )\n",
+    "\n",
+    "    # Split the temp dataframe into val and test dataframes.\n",
+    "    # test_size is relative to the temp frame, hence the renormalisation.\n",
+    "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
+    "    df_val, df_test, _y_val, _y_test = train_test_split(\n",
+    "        df_temp,\n",
+    "        y_temp,\n",
+    "        stratify=y_temp,\n",
+    "        test_size=relative_frac_test,\n",
+    "        random_state=random_state,\n",
+    "    )\n",
+    "\n",
+    "    # Sanity check: the three subsets together cover every input row.\n",
+    "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
+    "\n",
+    "    return df_train, df_val, df_test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pclass\n",
+      "3    491\n",
+      "1    216\n",
+      "2    184\n",
+      "Name: count, dtype: int64\n",
+      "Обучающая выборка:  (534, 3)\n",
+      "Pclass\n",
+      "3    294\n",
+      "1    130\n",
+      "2    110\n",
+      "Name: count, dtype: int64\n",
+      "Контрольная выборка:  (178, 3)\n",
+      "Pclass\n",
+      "3    98\n",
+      "1    43\n",
+      "2    37\n",
+      "Name: count, dtype: int64\n",
+      "Тестовая выборка:  (179, 3)\n",
+      "Pclass\n",
+      "3    99\n",
+      "1    43\n",
+      "2    37\n",
+      "Name: count, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Show how the observations are distributed across the labels (classes).\n",
+    "print(df.Pclass.value_counts())\n",
+    "\n",
+    "# Work on a copy restricted to the three columns used below.\n",
+    "data = df[[\"Pclass\", \"Survived\", \"AgeFillMedian\"]].copy()\n",
+    "\n",
+    "# A fixed random_state makes the split identical on every run; without it\n",
+    "# the rows assigned to each subset change each time the cell is executed.\n",
+    "df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
+    "    data,\n",
+    "    stratify_colname=\"Pclass\",\n",
+    "    frac_train=0.60,\n",
+    "    frac_val=0.20,\n",
+    "    frac_test=0.20,\n",
+    "    random_state=42,\n",
+    ")\n",
+    "\n",
+    "print(\"Обучающая выборка: \", df_train.shape)\n",
+    "print(df_train.Pclass.value_counts())\n",
+    "\n",
+    "print(\"Контрольная выборка: \", df_val.shape)\n",
+    "print(df_val.Pclass.value_counts())\n",
+    "\n",
+    "print(\"Тестовая выборка: \", df_test.shape)\n",
+    "print(df_test.Pclass.value_counts())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Выборка с избытком (oversampling)\n",
+    "\n",
+    "https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n",
+    "\n",
+    "https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n",
+    "\n",
+    "Выборка с недостатком (undersampling)\n",
+    "\n",
+    "https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n",
+    "\n",
+    "Библиотека imbalanced-learn\n",
+    "\n",
+    "https://imbalanced-learn.org/stable/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Обучающая выборка:  (534, 3)\n",
+      "Pclass\n",
+      "3    294\n",
+      "1    130\n",
+      "2    110\n",
+      "Name: count, dtype: int64\n",
+      "Обучающая выборка после oversampling:  (864, 3)\n",
+      "Pclass\n",
+      "3    294\n",
+      "2    290\n",
+      "1    280\n",
+      "Name: count, dtype: int64\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Pclass</th>\n",
+       "      <th>Survived</th>\n",
+       "      <th>AgeFillMedian</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>3</td>\n",
+       "      <td>0</td>\n",
+       "      <td>28.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>3</td>\n",
+       "      <td>0</td>\n",
+       "      <td>32.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>1</td>\n",
+       "      <td>28.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>45.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>3</td>\n",
+       "      <td>0</td>\n",
+       "      <td>7.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>859</th>\n",
+       "      <td>2</td>\n",
+       "      <td>0</td>\n",
+       "      <td>26.887761</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>860</th>\n",
+       "      <td>2</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.890459</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>861</th>\n",
+       "      <td>2</td>\n",
+       "      <td>0</td>\n",
+       "      <td>17.481437</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>862</th>\n",
+       "      <td>2</td>\n",
+       "      <td>0</td>\n",
+       "      <td>17.078473</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>863</th>\n",
+       "      <td>2</td>\n",
+       "      <td>1</td>\n",
+       "      <td>17.220445</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>864 rows × 3 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     Pclass  Survived  AgeFillMedian\n",
+       "0         3         0      28.000000\n",
+       "1         3         0      32.000000\n",
+       "2         3         1      28.000000\n",
+       "3         1         0      45.000000\n",
+       "4         3         0       7.000000\n",
+       "..      ...       ...            ...\n",
+       "859       2         0      26.887761\n",
+       "860       2         1       0.890459\n",
+       "861       2         0      17.481437\n",
+       "862       2         0      17.078473\n",
+       "863       2         1      17.220445\n",
+       "\n",
+       "[864 rows x 3 columns]"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from imblearn.over_sampling import ADASYN\n",
+    "\n",
+    "# Seed the sampler so the synthetic rows are the same on every run.\n",
+    "ada = ADASYN(random_state=42)\n",
+    "\n",
+    "print(\"Обучающая выборка: \", df_train.shape)\n",
+    "print(df_train.Pclass.value_counts())\n",
+    "\n",
+    "# NOTE(review): df_train is passed whole as X, so the target column Pclass is\n",
+    "# also among the features given to ADASYN; this is why the resampled frame\n",
+    "# still carries a Pclass column. Confirm that this is intended.\n",
+    "X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"Pclass\"])\n",
+    "df_train_adasyn = pd.DataFrame(X_resampled)\n",
+    "\n",
+    "print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
+    "print(df_train_adasyn.Pclass.value_counts())\n",
+    "\n",
+    "df_train_adasyn"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}