From ce8a562bf1790f482bd13586d237a617c6c7dc46 Mon Sep 17 00:00:00 2001 From: revengel66 Date: Sat, 28 Sep 2024 13:49:26 +0300 Subject: [PATCH] cloned lec2 --- lab_2/lec2.ipynb | 935 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 935 insertions(+) create mode 100644 lab_2/lec2.ipynb diff --git a/lab_2/lec2.ipynb b/lab_2/lec2.ipynb new file mode 100644 index 0000000..8388c3d --- /dev/null +++ b/lab_2/lec2.ipynb @@ -0,0 +1,935 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Загрузка данных в DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "<>:3: SyntaxWarning: invalid escape sequence '\\c'\n", + "<>:3: SyntaxWarning: invalid escape sequence '\\c'\n", + "C:\\Users\\New\\AppData\\Local\\Temp\\ipykernel_9568\\2466488670.py:3: SyntaxWarning: invalid escape sequence '\\c'\n", + " df = pd.read_csv(\"static\\csv\\Forbes Billionaires.csv\", index_col=\"PassengerId\")\n", + "C:\\Users\\New\\AppData\\Local\\Temp\\ipykernel_9568\\2466488670.py:3: SyntaxWarning: invalid escape sequence '\\c'\n", + " df = pd.read_csv(\"static\\csv\\Forbes Billionaires.csv\", index_col=\"PassengerId\")\n" + ] + }, + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: 'static\\\\csv\\\\Forbes Billionaires.csv'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[2], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstatic\u001b[39;49m\u001b[38;5;124;43m\\\u001b[39;49m\u001b[38;5;124;43mcsv\u001b[39;49m\u001b[38;5;124;43m\\\u001b[39;49m\u001b[38;5;124;43mForbes Billionaires.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex_col\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPassengerId\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 5\u001b[0m df\u001b[38;5;241m.\u001b[39minfo()\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28mprint\u001b[39m(df\u001b[38;5;241m.\u001b[39mshape)\n", + "File \u001b[1;32md:\\5semestr\\AIM\\aimvenv\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1026\u001b[0m, in \u001b[0;36mread_csv\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[0;32m 1013\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[0;32m 1014\u001b[0m dialect,\n\u001b[0;32m 1015\u001b[0m delimiter,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1022\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[0;32m 1023\u001b[0m )\n\u001b[0;32m 1024\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[1;32m-> 1026\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32md:\\5semestr\\AIM\\aimvenv\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:620\u001b[0m, in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 617\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[0;32m 619\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[1;32m--> 620\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[0;32m 623\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", + "File \u001b[1;32md:\\5semestr\\AIM\\aimvenv\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1620\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m 1617\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 1619\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m-> 1620\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32md:\\5semestr\\AIM\\aimvenv\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1880\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[1;34m(self, f, engine)\u001b[0m\n\u001b[0;32m 1878\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[0;32m 1879\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m-> 1880\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1881\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1882\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1883\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1884\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1885\u001b[0m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1886\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1887\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1888\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1889\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1890\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 1891\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n", + "File \u001b[1;32md:\\5semestr\\AIM\\aimvenv\\Lib\\site-packages\\pandas\\io\\common.py:873\u001b[0m, in \u001b[0;36mget_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[0;32m 868\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m 869\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[0;32m 870\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[0;32m 871\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[0;32m 872\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[1;32m--> 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[0;32m 874\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 875\u001b[0m \u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 876\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 877\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 878\u001b[0m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 879\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[0;32m 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n", + "\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'static\\\\csv\\\\Forbes Billionaires.csv'" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"static\\csv\\Forbes Billionaires.csv\", index_col=\"PassengerId\")\n", + "\n", + "df.info()\n", + "\n", + "print(df.shape)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Получение сведений о пропущенных данных" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Типы пропущенных данных:\n", + "- None - представление пустых данных в Python\n", + "- NaN - представление пустых данных в Pandas\n", + "- '' - пустая строка" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Survived 0\n", + "Pclass 0\n", + "Name 0\n", + "Sex 0\n", + "Age 177\n", + "SibSp 0\n", + "Parch 0\n", + "Ticket 0\n", + "Fare 0\n", + "Cabin 687\n", + "Embarked 2\n", + "dtype: int64\n", + "\n", + "Survived False\n", + "Pclass False\n", + "Name False\n", + "Sex False\n", + "Age True\n", + "SibSp False\n", + "Parch False\n", + "Ticket False\n", + "Fare False\n", + "Cabin True\n", + "Embarked True\n", + "dtype: bool\n", + "\n", + "Age процент пустых значений: %19.87\n", + "Cabin процент пустых значений: %77.10\n", + "Embarked процент пустых значений: %0.22\n" + ] + } + ], + "source": [ + "# Количество пустых значений признаков\n", + "print(df.isnull().sum())\n", + "\n", + "print()\n", + "\n", + "# Есть ли пустые значения признаков\n", + "print(df.isnull().any())\n", + "\n", + "print()\n", + "\n", + "# Процент пустых значений признаков\n", + "for i in df.columns:\n", + " null_rate = df[i].isnull().sum() / len(df) * 100\n", + " if null_rate > 0:\n", + " print(f\"{i} процент пустых значений: %{null_rate:.2f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Заполнение пропущенных данных\n", + "\n", + "https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n", + "\n", + "https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(891, 11)\n", + "Survived False\n", + "Pclass False\n", + "Name False\n", + "Sex False\n", + "Age False\n", + "SibSp False\n", + "Parch False\n", + "Ticket False\n", + "Fare False\n", + "Cabin False\n", + "Embarked False\n", + "dtype: bool\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedAgeFillNAAgeFillMedian
PassengerId
88702Montvila, Rev. Juozasmale27.00021153613.00NaNS27.027.0
88811Graham, Miss. Margaret Edithfemale19.00011205330.00B42S19.019.0
88903Johnston, Miss. Catherine Helen \"Carrie\"femaleNaN12W./C. 660723.45NaNS0.028.0
89011Behr, Mr. Karl Howellmale26.00011136930.00C148C26.026.0
89103Dooley, Mr. Patrickmale32.0003703767.75NaNQ32.032.0
\n", + "
" + ], + "text/plain": [ + " Survived Pclass Name \\\n", + "PassengerId \n", + "887 0 2 Montvila, Rev. Juozas \n", + "888 1 1 Graham, Miss. Margaret Edith \n", + "889 0 3 Johnston, Miss. Catherine Helen \"Carrie\" \n", + "890 1 1 Behr, Mr. Karl Howell \n", + "891 0 3 Dooley, Mr. Patrick \n", + "\n", + " Sex Age SibSp Parch Ticket Fare Cabin Embarked \\\n", + "PassengerId \n", + "887 male 27.0 0 0 211536 13.00 NaN S \n", + "888 female 19.0 0 0 112053 30.00 B42 S \n", + "889 female NaN 1 2 W./C. 6607 23.45 NaN S \n", + "890 male 26.0 0 0 111369 30.00 C148 C \n", + "891 male 32.0 0 0 370376 7.75 NaN Q \n", + "\n", + " AgeFillNA AgeFillMedian \n", + "PassengerId \n", + "887 27.0 27.0 \n", + "888 19.0 19.0 \n", + "889 0.0 28.0 \n", + "890 26.0 26.0 \n", + "891 32.0 32.0 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fillna_df = df.fillna(0)\n", + "\n", + "print(fillna_df.shape)\n", + "\n", + "print(fillna_df.isnull().any())\n", + "\n", + "# Замена пустых данных на 0\n", + "df[\"AgeFillNA\"] = df[\"Age\"].fillna(0)\n", + "\n", + "# Замена пустых данных на медиану\n", + "df[\"AgeFillMedian\"] = df[\"Age\"].fillna(df[\"Age\"].median())\n", + "\n", + "df.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedAgeFillNAAgeFillMedianAgeCopy
PassengerId
88702Montvila, Rev. Juozasmale27.00021153613.00NaNS27.027.027.0
88811Graham, Miss. Margaret Edithfemale19.00011205330.00B42S19.019.019.0
88903Johnston, Miss. Catherine Helen \"Carrie\"femaleNaN12W./C. 660723.45NaNS0.028.00.0
89011Behr, Mr. Karl Howellmale26.00011136930.00C148C26.026.026.0
89103Dooley, Mr. Patrickmale32.0003703767.75NaNQ32.032.032.0
\n", + "
" + ], + "text/plain": [ + " Survived Pclass Name \\\n", + "PassengerId \n", + "887 0 2 Montvila, Rev. Juozas \n", + "888 1 1 Graham, Miss. Margaret Edith \n", + "889 0 3 Johnston, Miss. Catherine Helen \"Carrie\" \n", + "890 1 1 Behr, Mr. Karl Howell \n", + "891 0 3 Dooley, Mr. Patrick \n", + "\n", + " Sex Age SibSp Parch Ticket Fare Cabin Embarked \\\n", + "PassengerId \n", + "887 male 27.0 0 0 211536 13.00 NaN S \n", + "888 female 19.0 0 0 112053 30.00 B42 S \n", + "889 female NaN 1 2 W./C. 6607 23.45 NaN S \n", + "890 male 26.0 0 0 111369 30.00 C148 C \n", + "891 male 32.0 0 0 370376 7.75 NaN Q \n", + "\n", + " AgeFillNA AgeFillMedian AgeCopy \n", + "PassengerId \n", + "887 27.0 27.0 27.0 \n", + "888 19.0 19.0 19.0 \n", + "889 0.0 28.0 0.0 \n", + "890 26.0 26.0 26.0 \n", + "891 32.0 32.0 32.0 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"AgeCopy\"] = df[\"Age\"]\n", + "\n", + "# Замена данных сразу в DataFrame без копирования\n", + "df.fillna({\"AgeCopy\": 0}, inplace=True)\n", + "\n", + "df.tail()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Удаление наблюдений с пропусками" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(183, 14)\n", + "Survived False\n", + "Pclass False\n", + "Name False\n", + "Sex False\n", + "Age False\n", + "SibSp False\n", + "Parch False\n", + "Ticket False\n", + "Fare False\n", + "Cabin False\n", + "Embarked False\n", + "dtype: bool\n" + ] + } + ], + "source": [ + "dropna_df = df.dropna()\n", + "\n", + "print(dropna_df.shape)\n", + "\n", + "print(fillna_df.isnull().any())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Создание выборок данных\n", + "\n", + "Библиотека scikit-learn\n", + "\n", + "https://scikit-learn.org/stable/index.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Функция для создания выборок\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "\n", + "def split_stratified_into_train_val_test(\n", + " df_input,\n", + " stratify_colname=\"y\",\n", + " frac_train=0.6,\n", + " frac_val=0.15,\n", + " frac_test=0.25,\n", + " random_state=None,\n", + "):\n", + " \"\"\"\n", + " Splits a Pandas dataframe into three subsets (train, val, and test)\n", + " following fractional ratios provided by the user, where each subset is\n", + " stratified by the values in a specific column (that is, each subset has\n", + " the same relative frequency of the values in the column). It performs this\n", + " splitting by running train_test_split() twice.\n", + "\n", + " Parameters\n", + " ----------\n", + " df_input : Pandas dataframe\n", + " Input dataframe to be split.\n", + " stratify_colname : str\n", + " The name of the column that will be used for stratification. Usually\n", + " this column would be for the label.\n", + " frac_train : float\n", + " frac_val : float\n", + " frac_test : float\n", + " The ratios with which the dataframe will be split into train, val, and\n", + " test data. The values should be expressed as float fractions and should\n", + " sum to 1.0.\n", + " random_state : int, None, or RandomStateInstance\n", + " Value to be passed to train_test_split().\n", + "\n", + " Returns\n", + " -------\n", + " df_train, df_val, df_test :\n", + " Dataframes containing the three splits.\n", + " \"\"\"\n", + "\n", + " if frac_train + frac_val + frac_test != 1.0:\n", + " raise ValueError(\n", + " \"fractions %f, %f, %f do not add up to 1.0\"\n", + " % (frac_train, frac_val, frac_test)\n", + " )\n", + "\n", + " if stratify_colname not in df_input.columns:\n", + " raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n", + "\n", + " X = df_input # Contains all columns.\n", + " y = df_input[\n", + " [stratify_colname]\n", + " ] # Dataframe of just the column on which to stratify.\n", + "\n", + " # Split original dataframe into train and temp dataframes.\n", + " df_train, df_temp, y_train, y_temp = train_test_split(\n", + " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n", + " )\n", + "\n", + " # Split the temp dataframe into val and test dataframes.\n", + " relative_frac_test = frac_test / (frac_val + frac_test)\n", + " df_val, df_test, y_val, y_test = train_test_split(\n", + " df_temp,\n", + " y_temp,\n", + " stratify=y_temp,\n", + " test_size=relative_frac_test,\n", + " random_state=random_state,\n", + " )\n", + "\n", + " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n", + "\n", + " return df_train, df_val, df_test" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pclass\n", + "3 491\n", + "1 216\n", + "2 184\n", + "Name: count, dtype: int64\n", + "Обучающая выборка: (534, 3)\n", + "Pclass\n", + "3 294\n", + "1 130\n", + "2 110\n", + "Name: count, dtype: int64\n", + "Контрольная выборка: (178, 3)\n", + "Pclass\n", + "3 98\n", + "1 43\n", + "2 37\n", + "Name: count, dtype: int64\n", + "Тестовая выборка: (179, 3)\n", + "Pclass\n", + "3 99\n", + "1 43\n", + "2 37\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "# Вывод распределения количества наблюдений по меткам (классам)\n", + "print(df.Pclass.value_counts())\n", + "\n", + "data = df[[\"Pclass\", \"Survived\", \"AgeFillMedian\"]].copy()\n", + "\n", + "df_train, df_val, df_test = split_stratified_into_train_val_test(\n", + " data, stratify_colname=\"Pclass\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n", + ")\n", + "\n", + "print(\"Обучающая выборка: \", df_train.shape)\n", + "print(df_train.Pclass.value_counts())\n", + "\n", + "print(\"Контрольная выборка: \", df_val.shape)\n", + "print(df_val.Pclass.value_counts())\n", + "\n", + "print(\"Тестовая выборка: \", df_test.shape)\n", + "print(df_test.Pclass.value_counts())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Выборка с избытком (oversampling)\n", + "\n", + "https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n", + "\n", + "https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n", + "\n", + "Выборка с недостатком (undersampling)\n", + "\n", + "https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n", + "\n", + "Библиотека imbalanced-learn\n", + "\n", + "https://imbalanced-learn.org/stable/" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Обучающая выборка: (534, 3)\n", + "Pclass\n", + "3 294\n", + "1 130\n", + "2 110\n", + "Name: count, dtype: int64\n", + "Обучающая выборка после oversampling: (864, 3)\n", + "Pclass\n", + "3 294\n", + "2 290\n", + "1 280\n", + "Name: count, dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PclassSurvivedAgeFillMedian
03028.000000
13032.000000
23128.000000
31045.000000
4307.000000
............
8592026.887761
860210.890459
8612017.481437
8622017.078473
8632117.220445
\n", + "

864 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " Pclass Survived AgeFillMedian\n", + "0 3 0 28.000000\n", + "1 3 0 32.000000\n", + "2 3 1 28.000000\n", + "3 1 0 45.000000\n", + "4 3 0 7.000000\n", + ".. ... ... ...\n", + "859 2 0 26.887761\n", + "860 2 1 0.890459\n", + "861 2 0 17.481437\n", + "862 2 0 17.078473\n", + "863 2 1 17.220445\n", + "\n", + "[864 rows x 3 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from imblearn.over_sampling import ADASYN\n", + "\n", + "ada = ADASYN()\n", + "\n", + "print(\"Обучающая выборка: \", df_train.shape)\n", + "print(df_train.Pclass.value_counts())\n", + "\n", + "X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"Pclass\"])\n", + "df_train_adasyn = pd.DataFrame(X_resampled)\n", + "\n", + "print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n", + "print(df_train_adasyn.Pclass.value_counts())\n", + "\n", + "df_train_adasyn" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}