diff --git a/lab_2/lec2.ipynb b/lab_2/lec2.ipynb
new file mode 100644
index 0000000..8388c3d
--- /dev/null
+++ b/lab_2/lec2.ipynb
@@ -0,0 +1,935 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Загрузка данных в DataFrame"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "<>:3: SyntaxWarning: invalid escape sequence '\\c'\n",
+ "<>:3: SyntaxWarning: invalid escape sequence '\\c'\n",
+ "C:\\Users\\New\\AppData\\Local\\Temp\\ipykernel_9568\\2466488670.py:3: SyntaxWarning: invalid escape sequence '\\c'\n",
+ " df = pd.read_csv(\"static\\csv\\Forbes Billionaires.csv\", index_col=\"PassengerId\")\n",
+ "C:\\Users\\New\\AppData\\Local\\Temp\\ipykernel_9568\\2466488670.py:3: SyntaxWarning: invalid escape sequence '\\c'\n",
+ " df = pd.read_csv(\"static\\csv\\Forbes Billionaires.csv\", index_col=\"PassengerId\")\n"
+ ]
+ },
+ {
+ "ename": "FileNotFoundError",
+ "evalue": "[Errno 2] No such file or directory: 'static\\\\csv\\\\Forbes Billionaires.csv'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[2], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstatic\u001b[39;49m\u001b[38;5;124;43m\\\u001b[39;49m\u001b[38;5;124;43mcsv\u001b[39;49m\u001b[38;5;124;43m\\\u001b[39;49m\u001b[38;5;124;43mForbes Billionaires.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex_col\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPassengerId\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 5\u001b[0m df\u001b[38;5;241m.\u001b[39minfo()\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28mprint\u001b[39m(df\u001b[38;5;241m.\u001b[39mshape)\n",
+ "File \u001b[1;32md:\\5semestr\\AIM\\aimvenv\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1026\u001b[0m, in \u001b[0;36mread_csv\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[0;32m 1013\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[0;32m 1014\u001b[0m dialect,\n\u001b[0;32m 1015\u001b[0m delimiter,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1022\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[0;32m 1023\u001b[0m )\n\u001b[0;32m 1024\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[1;32m-> 1026\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[1;32md:\\5semestr\\AIM\\aimvenv\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:620\u001b[0m, in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 617\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[0;32m 619\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[1;32m--> 620\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[0;32m 623\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n",
+ "File \u001b[1;32md:\\5semestr\\AIM\\aimvenv\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1620\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m 1617\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 1619\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m-> 1620\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[1;32md:\\5semestr\\AIM\\aimvenv\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1880\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[1;34m(self, f, engine)\u001b[0m\n\u001b[0;32m 1878\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[0;32m 1879\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m-> 1880\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1881\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1882\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1883\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1884\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1885\u001b[0m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1886\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1887\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1888\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1889\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1890\u001b[0m 
\u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 1891\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n",
+ "File \u001b[1;32md:\\5semestr\\AIM\\aimvenv\\Lib\\site-packages\\pandas\\io\\common.py:873\u001b[0m, in \u001b[0;36mget_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[0;32m 868\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m 869\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[0;32m 870\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[0;32m 871\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[0;32m 872\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[1;32m--> 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[0;32m 874\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 875\u001b[0m \u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 876\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 877\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 878\u001b[0m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 879\u001b[0m \u001b[43m 
\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[0;32m 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n",
+ "\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'static\\\\csv\\\\Forbes Billionaires.csv'"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# Forward slashes keep the path portable across operating systems and avoid\n",
+ "# the invalid backslash escape sequences of the previous string literal.\n",
+ "# NOTE(review): later cells read Titanic-style columns (Survived, Pclass, Age);\n",
+ "# confirm that this CSV really provides them together with PassengerId.\n",
+ "df = pd.read_csv(\"static/csv/Forbes Billionaires.csv\", index_col=\"PassengerId\")\n",
+ "\n",
+ "# Column dtypes and non-null counts\n",
+ "df.info()\n",
+ "\n",
+ "print(df.shape)\n",
+ "\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Получение сведений о пропущенных данных"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Типы пропущенных данных:\n",
+ "- None - представление пустых данных в Python\n",
+ "- NaN - представление пустых данных в Pandas\n",
+ "- '' - пустая строка"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Survived 0\n",
+ "Pclass 0\n",
+ "Name 0\n",
+ "Sex 0\n",
+ "Age 177\n",
+ "SibSp 0\n",
+ "Parch 0\n",
+ "Ticket 0\n",
+ "Fare 0\n",
+ "Cabin 687\n",
+ "Embarked 2\n",
+ "dtype: int64\n",
+ "\n",
+ "Survived False\n",
+ "Pclass False\n",
+ "Name False\n",
+ "Sex False\n",
+ "Age True\n",
+ "SibSp False\n",
+ "Parch False\n",
+ "Ticket False\n",
+ "Fare False\n",
+ "Cabin True\n",
+ "Embarked True\n",
+ "dtype: bool\n",
+ "\n",
+ "Age процент пустых значений: %19.87\n",
+ "Cabin процент пустых значений: %77.10\n",
+ "Embarked процент пустых значений: %0.22\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Number of missing values per feature\n",
+ "null_counts = df.isnull().sum()\n",
+ "print(null_counts)\n",
+ "\n",
+ "print()\n",
+ "\n",
+ "# Whether each feature contains at least one missing value\n",
+ "print(df.isnull().any())\n",
+ "\n",
+ "print()\n",
+ "\n",
+ "# Percentage of missing values, reported only for features that have gaps\n",
+ "total_rows = len(df)\n",
+ "for column, null_count in null_counts.items():\n",
+ "    null_rate = null_count / total_rows * 100\n",
+ "    if null_rate > 0:\n",
+ "        print(f\"{column} процент пустых значений: %{null_rate:.2f}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Заполнение пропущенных данных\n",
+ "\n",
+ "https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n",
+ "\n",
+ "https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(891, 11)\n",
+ "Survived False\n",
+ "Pclass False\n",
+ "Name False\n",
+ "Sex False\n",
+ "Age False\n",
+ "SibSp False\n",
+ "Parch False\n",
+ "Ticket False\n",
+ "Fare False\n",
+ "Cabin False\n",
+ "Embarked False\n",
+ "dtype: bool\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Survived | \n",
+ " Pclass | \n",
+ " Name | \n",
+ " Sex | \n",
+ " Age | \n",
+ " SibSp | \n",
+ " Parch | \n",
+ " Ticket | \n",
+ " Fare | \n",
+ " Cabin | \n",
+ " Embarked | \n",
+ " AgeFillNA | \n",
+ " AgeFillMedian | \n",
+ "
\n",
+ " \n",
+ " PassengerId | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 887 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " Montvila, Rev. Juozas | \n",
+ " male | \n",
+ " 27.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 211536 | \n",
+ " 13.00 | \n",
+ " NaN | \n",
+ " S | \n",
+ " 27.0 | \n",
+ " 27.0 | \n",
+ "
\n",
+ " \n",
+ " 888 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Graham, Miss. Margaret Edith | \n",
+ " female | \n",
+ " 19.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 112053 | \n",
+ " 30.00 | \n",
+ " B42 | \n",
+ " S | \n",
+ " 19.0 | \n",
+ " 19.0 | \n",
+ "
\n",
+ " \n",
+ " 889 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Johnston, Miss. Catherine Helen \"Carrie\" | \n",
+ " female | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " W./C. 6607 | \n",
+ " 23.45 | \n",
+ " NaN | \n",
+ " S | \n",
+ " 0.0 | \n",
+ " 28.0 | \n",
+ "
\n",
+ " \n",
+ " 890 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Behr, Mr. Karl Howell | \n",
+ " male | \n",
+ " 26.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 111369 | \n",
+ " 30.00 | \n",
+ " C148 | \n",
+ " C | \n",
+ " 26.0 | \n",
+ " 26.0 | \n",
+ "
\n",
+ " \n",
+ " 891 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Dooley, Mr. Patrick | \n",
+ " male | \n",
+ " 32.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 370376 | \n",
+ " 7.75 | \n",
+ " NaN | \n",
+ " Q | \n",
+ " 32.0 | \n",
+ " 32.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Survived Pclass Name \\\n",
+ "PassengerId \n",
+ "887 0 2 Montvila, Rev. Juozas \n",
+ "888 1 1 Graham, Miss. Margaret Edith \n",
+ "889 0 3 Johnston, Miss. Catherine Helen \"Carrie\" \n",
+ "890 1 1 Behr, Mr. Karl Howell \n",
+ "891 0 3 Dooley, Mr. Patrick \n",
+ "\n",
+ " Sex Age SibSp Parch Ticket Fare Cabin Embarked \\\n",
+ "PassengerId \n",
+ "887 male 27.0 0 0 211536 13.00 NaN S \n",
+ "888 female 19.0 0 0 112053 30.00 B42 S \n",
+ "889 female NaN 1 2 W./C. 6607 23.45 NaN S \n",
+ "890 male 26.0 0 0 111369 30.00 C148 C \n",
+ "891 male 32.0 0 0 370376 7.75 NaN Q \n",
+ "\n",
+ " AgeFillNA AgeFillMedian \n",
+ "PassengerId \n",
+ "887 27.0 27.0 \n",
+ "888 19.0 19.0 \n",
+ "889 0.0 28.0 \n",
+ "890 26.0 26.0 \n",
+ "891 32.0 32.0 "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# New frame in which every missing value is replaced with 0\n",
+ "fillna_df = df.fillna(0)\n",
+ "\n",
+ "print(fillna_df.shape)\n",
+ "\n",
+ "print(fillna_df.isnull().any())\n",
+ "\n",
+ "age = df[\"Age\"]\n",
+ "\n",
+ "# Missing ages replaced with 0\n",
+ "df[\"AgeFillNA\"] = age.fillna(0)\n",
+ "\n",
+ "# Missing ages replaced with the median age\n",
+ "median_age = age.median()\n",
+ "df[\"AgeFillMedian\"] = age.fillna(median_age)\n",
+ "\n",
+ "df.tail()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Survived | \n",
+ " Pclass | \n",
+ " Name | \n",
+ " Sex | \n",
+ " Age | \n",
+ " SibSp | \n",
+ " Parch | \n",
+ " Ticket | \n",
+ " Fare | \n",
+ " Cabin | \n",
+ " Embarked | \n",
+ " AgeFillNA | \n",
+ " AgeFillMedian | \n",
+ " AgeCopy | \n",
+ "
\n",
+ " \n",
+ " PassengerId | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 887 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " Montvila, Rev. Juozas | \n",
+ " male | \n",
+ " 27.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 211536 | \n",
+ " 13.00 | \n",
+ " NaN | \n",
+ " S | \n",
+ " 27.0 | \n",
+ " 27.0 | \n",
+ " 27.0 | \n",
+ "
\n",
+ " \n",
+ " 888 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Graham, Miss. Margaret Edith | \n",
+ " female | \n",
+ " 19.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 112053 | \n",
+ " 30.00 | \n",
+ " B42 | \n",
+ " S | \n",
+ " 19.0 | \n",
+ " 19.0 | \n",
+ " 19.0 | \n",
+ "
\n",
+ " \n",
+ " 889 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Johnston, Miss. Catherine Helen \"Carrie\" | \n",
+ " female | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " W./C. 6607 | \n",
+ " 23.45 | \n",
+ " NaN | \n",
+ " S | \n",
+ " 0.0 | \n",
+ " 28.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 890 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Behr, Mr. Karl Howell | \n",
+ " male | \n",
+ " 26.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 111369 | \n",
+ " 30.00 | \n",
+ " C148 | \n",
+ " C | \n",
+ " 26.0 | \n",
+ " 26.0 | \n",
+ " 26.0 | \n",
+ "
\n",
+ " \n",
+ " 891 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Dooley, Mr. Patrick | \n",
+ " male | \n",
+ " 32.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 370376 | \n",
+ " 7.75 | \n",
+ " NaN | \n",
+ " Q | \n",
+ " 32.0 | \n",
+ " 32.0 | \n",
+ " 32.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Survived Pclass Name \\\n",
+ "PassengerId \n",
+ "887 0 2 Montvila, Rev. Juozas \n",
+ "888 1 1 Graham, Miss. Margaret Edith \n",
+ "889 0 3 Johnston, Miss. Catherine Helen \"Carrie\" \n",
+ "890 1 1 Behr, Mr. Karl Howell \n",
+ "891 0 3 Dooley, Mr. Patrick \n",
+ "\n",
+ " Sex Age SibSp Parch Ticket Fare Cabin Embarked \\\n",
+ "PassengerId \n",
+ "887 male 27.0 0 0 211536 13.00 NaN S \n",
+ "888 female 19.0 0 0 112053 30.00 B42 S \n",
+ "889 female NaN 1 2 W./C. 6607 23.45 NaN S \n",
+ "890 male 26.0 0 0 111369 30.00 C148 C \n",
+ "891 male 32.0 0 0 370376 7.75 NaN Q \n",
+ "\n",
+ " AgeFillNA AgeFillMedian AgeCopy \n",
+ "PassengerId \n",
+ "887 27.0 27.0 27.0 \n",
+ "888 19.0 19.0 19.0 \n",
+ "889 0.0 28.0 0.0 \n",
+ "890 26.0 26.0 26.0 \n",
+ "891 32.0 32.0 32.0 "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Work on a duplicate of the Age column\n",
+ "df[\"AgeCopy\"] = df[\"Age\"]\n",
+ "\n",
+ "# Fill the gaps directly in the DataFrame (in place); the dict limits the\n",
+ "# replacement to the AgeCopy column\n",
+ "df.fillna(value={\"AgeCopy\": 0}, inplace=True)\n",
+ "\n",
+ "df.tail()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Удаление наблюдений с пропусками"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(183, 14)\n",
+ "Survived False\n",
+ "Pclass False\n",
+ "Name False\n",
+ "Sex False\n",
+ "Age False\n",
+ "SibSp False\n",
+ "Parch False\n",
+ "Ticket False\n",
+ "Fare False\n",
+ "Cabin False\n",
+ "Embarked False\n",
+ "dtype: bool\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Keep only the rows that have no missing values in any column\n",
+ "dropna_df = df.dropna()\n",
+ "\n",
+ "print(dropna_df.shape)\n",
+ "\n",
+ "# Verify the frame produced above (the previous version inspected fillna_df)\n",
+ "print(dropna_df.isnull().any())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Создание выборок данных\n",
+ "\n",
+ "Библиотека scikit-learn\n",
+ "\n",
+ "https://scikit-learn.org/stable/index.html"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Функция для создания выборок\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "\n",
+ "def split_stratified_into_train_val_test(\n",
+ "    df_input,\n",
+ "    stratify_colname=\"y\",\n",
+ "    frac_train=0.6,\n",
+ "    frac_val=0.15,\n",
+ "    frac_test=0.25,\n",
+ "    random_state=None,\n",
+ "):\n",
+ "    \"\"\"\n",
+ "    Split a Pandas dataframe into stratified train, val and test subsets.\n",
+ "\n",
+ "    The split is performed by running train_test_split() twice: first to\n",
+ "    separate the train subset, then to divide the remainder into val and\n",
+ "    test subsets. Both calls stratify on the same column, so each subset\n",
+ "    keeps the relative frequency of the values in that column.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    df_input : Pandas dataframe\n",
+ "        Input dataframe to be split.\n",
+ "    stratify_colname : str\n",
+ "        The name of the column that will be used for stratification. Usually\n",
+ "        this column would be for the label.\n",
+ "    frac_train : float\n",
+ "    frac_val : float\n",
+ "    frac_test : float\n",
+ "        The ratios with which the dataframe will be split into train, val, and\n",
+ "        test data. The values should be expressed as float fractions and should\n",
+ "        sum to 1.0 (compared with a small tolerance, so that inputs such as\n",
+ "        0.7, 0.2, 0.1 are not rejected because of floating-point rounding).\n",
+ "    random_state : int, None, or RandomStateInstance\n",
+ "        Value to be passed to both train_test_split() calls.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    df_train, df_val, df_test :\n",
+ "        Dataframes containing the three splits.\n",
+ "\n",
+ "    Raises\n",
+ "    ------\n",
+ "    ValueError\n",
+ "        If the fractions do not sum to 1.0, or if stratify_colname is not a\n",
+ "        column of df_input.\n",
+ "    \"\"\"\n",
+ "    import math  # local import keeps the definition self-contained\n",
+ "\n",
+ "    # Exact float equality would reject valid inputs: 0.7 + 0.2 + 0.1 != 1.0.\n",
+ "    if not math.isclose(frac_train + frac_val + frac_test, 1.0, abs_tol=1e-9):\n",
+ "        raise ValueError(\n",
+ "            \"fractions %f, %f, %f do not add up to 1.0\"\n",
+ "            % (frac_train, frac_val, frac_test)\n",
+ "        )\n",
+ "\n",
+ "    if stratify_colname not in df_input.columns:\n",
+ "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
+ "\n",
+ "    X = df_input  # Contains all columns.\n",
+ "    y = df_input[[stratify_colname]]  # Dataframe of just the stratification column.\n",
+ "\n",
+ "    # Split original dataframe into train and temp dataframes.\n",
+ "    df_train, df_temp, _, y_temp = train_test_split(\n",
+ "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
+ "    )\n",
+ "\n",
+ "    # Split the temp dataframe into val and test dataframes; the test share is\n",
+ "    # re-expressed relative to the size of the temp dataframe.\n",
+ "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
+ "    df_val, df_test, _, _ = train_test_split(\n",
+ "        df_temp,\n",
+ "        y_temp,\n",
+ "        stratify=y_temp,\n",
+ "        test_size=relative_frac_test,\n",
+ "        random_state=random_state,\n",
+ "    )\n",
+ "\n",
+ "    # Every input row must land in exactly one of the three subsets.\n",
+ "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
+ "\n",
+ "    return df_train, df_val, df_test"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Pclass\n",
+ "3 491\n",
+ "1 216\n",
+ "2 184\n",
+ "Name: count, dtype: int64\n",
+ "Обучающая выборка: (534, 3)\n",
+ "Pclass\n",
+ "3 294\n",
+ "1 130\n",
+ "2 110\n",
+ "Name: count, dtype: int64\n",
+ "Контрольная выборка: (178, 3)\n",
+ "Pclass\n",
+ "3 98\n",
+ "1 43\n",
+ "2 37\n",
+ "Name: count, dtype: int64\n",
+ "Тестовая выборка: (179, 3)\n",
+ "Pclass\n",
+ "3 99\n",
+ "1 43\n",
+ "2 37\n",
+ "Name: count, dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Distribution of the number of observations across labels (classes)\n",
+ "print(df.Pclass.value_counts())\n",
+ "\n",
+ "data = df[[\"Pclass\", \"Survived\", \"AgeFillMedian\"]].copy()\n",
+ "\n",
+ "# A fixed seed makes the split reproducible after Restart & Run All\n",
+ "df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
+ "    data,\n",
+ "    stratify_colname=\"Pclass\",\n",
+ "    frac_train=0.60,\n",
+ "    frac_val=0.20,\n",
+ "    frac_test=0.20,\n",
+ "    random_state=42,\n",
+ ")\n",
+ "\n",
+ "print(\"Обучающая выборка: \", df_train.shape)\n",
+ "print(df_train.Pclass.value_counts())\n",
+ "\n",
+ "print(\"Контрольная выборка: \", df_val.shape)\n",
+ "print(df_val.Pclass.value_counts())\n",
+ "\n",
+ "print(\"Тестовая выборка: \", df_test.shape)\n",
+ "print(df_test.Pclass.value_counts())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Выборка с избытком (oversampling)\n",
+ "\n",
+ "https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n",
+ "\n",
+ "https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n",
+ "\n",
+ "Выборка с недостатком (undersampling)\n",
+ "\n",
+ "https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n",
+ "\n",
+ "Библиотека imbalanced-learn\n",
+ "\n",
+ "https://imbalanced-learn.org/stable/"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Обучающая выборка: (534, 3)\n",
+ "Pclass\n",
+ "3 294\n",
+ "1 130\n",
+ "2 110\n",
+ "Name: count, dtype: int64\n",
+ "Обучающая выборка после oversampling: (864, 3)\n",
+ "Pclass\n",
+ "3 294\n",
+ "2 290\n",
+ "1 280\n",
+ "Name: count, dtype: int64\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Pclass | \n",
+ " Survived | \n",
+ " AgeFillMedian | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 28.000000 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 32.000000 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 28.000000 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 45.000000 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 7.000000 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 859 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 26.887761 | \n",
+ "
\n",
+ " \n",
+ " 860 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 0.890459 | \n",
+ "
\n",
+ " \n",
+ " 861 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 17.481437 | \n",
+ "
\n",
+ " \n",
+ " 862 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 17.078473 | \n",
+ "
\n",
+ " \n",
+ " 863 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 17.220445 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
864 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Pclass Survived AgeFillMedian\n",
+ "0 3 0 28.000000\n",
+ "1 3 0 32.000000\n",
+ "2 3 1 28.000000\n",
+ "3 1 0 45.000000\n",
+ "4 3 0 7.000000\n",
+ ".. ... ... ...\n",
+ "859 2 0 26.887761\n",
+ "860 2 1 0.890459\n",
+ "861 2 0 17.481437\n",
+ "862 2 0 17.078473\n",
+ "863 2 1 17.220445\n",
+ "\n",
+ "[864 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from imblearn.over_sampling import ADASYN\n",
+ "\n",
+ "# A fixed seed makes the generated synthetic samples reproducible\n",
+ "ada = ADASYN(random_state=42)\n",
+ "\n",
+ "print(\"Обучающая выборка: \", df_train.shape)\n",
+ "print(df_train.Pclass.value_counts())\n",
+ "\n",
+ "# NOTE(review): df_train still contains the Pclass column, so the target is\n",
+ "# also part of the feature matrix used by the sampler; confirm this is intended.\n",
+ "X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"Pclass\"])\n",
+ "df_train_adasyn = pd.DataFrame(X_resampled)\n",
+ "\n",
+ "print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
+ "print(df_train_adasyn.Pclass.value_counts())\n",
+ "\n",
+ "df_train_adasyn"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}