{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Загрузка данных в DataFrame" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"import pandas as pd\n",
"\n",
"# Forward slashes work on every OS; the former backslash separators were\n",
"# parsed as (invalid) escape sequences inside the string literal.\n",
"# NOTE(review): every later cell uses Titanic columns (PassengerId, Survived,\n",
"# Pclass, Age, ...) - confirm that this file really is the Titanic dataset.\n",
"DATA_PATH = \"static/csv/Forbes Billionaires.csv\"\n",
"\n",
"df = pd.read_csv(DATA_PATH, index_col=\"PassengerId\")\n",
"\n",
"df.info()\n",
"\n",
"print(df.shape)\n",
"\n",
"df.head()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Получение сведений о пропущенных данных" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Типы пропущенных данных:\n", "- None - представление пустых данных в Python\n", "- NaN - представление пустых данных в Pandas\n", "- '' - пустая строка" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Survived 0\n", "Pclass 0\n", "Name 0\n", "Sex 0\n", "Age 177\n", "SibSp 0\n", "Parch 0\n", "Ticket 0\n", "Fare 0\n", "Cabin 687\n", "Embarked 2\n", "dtype: int64\n", "\n", "Survived False\n", "Pclass False\n", "Name False\n", "Sex False\n", "Age True\n", "SibSp False\n", "Parch False\n", "Ticket False\n", "Fare False\n", "Cabin True\n", "Embarked True\n", "dtype: bool\n", "\n", "Age процент пустых значений: %19.87\n", "Cabin процент пустых значений: %77.10\n", "Embarked процент пустых значений: %0.22\n" ] } ], "source": [
"# Number of missing values per feature\n",
"print(df.isnull().sum())\n",
"\n",
"print()\n",
"\n",
"# Whether each feature contains any missing value\n",
"print(df.isnull().any())\n",
"\n",
"print()\n",
"\n",
"# Percentage of missing values, reported only for features that have some\n",
"for column in df.columns:\n",
"    null_rate = df[column].isnull().sum() / len(df) * 100\n",
"    if null_rate > 0:\n",
"        print(f\"{column} процент пустых значений: %{null_rate:.2f}\")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Заполнение пропущенных данных\n", "\n", "https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n", "\n", "https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(891, 11)\n", "Survived False\n", "Pclass False\n", "Name False\n", "Sex False\n", "Age False\n", "SibSp False\n", "Parch False\n", "Ticket False\n", "Fare False\n", "Cabin False\n", "Embarked False\n", "dtype: bool\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedAgeFillNAAgeFillMedian
PassengerId
88702Montvila, Rev. Juozasmale27.00021153613.00NaNS27.027.0
88811Graham, Miss. Margaret Edithfemale19.00011205330.00B42S19.019.0
88903Johnston, Miss. Catherine Helen \"Carrie\"femaleNaN12W./C. 660723.45NaNS0.028.0
89011Behr, Mr. Karl Howellmale26.00011136930.00C148C26.026.0
89103Dooley, Mr. Patrickmale32.0003703767.75NaNQ32.032.0
\n", "
" ], "text/plain": [ " Survived Pclass Name \\\n", "PassengerId \n", "887 0 2 Montvila, Rev. Juozas \n", "888 1 1 Graham, Miss. Margaret Edith \n", "889 0 3 Johnston, Miss. Catherine Helen \"Carrie\" \n", "890 1 1 Behr, Mr. Karl Howell \n", "891 0 3 Dooley, Mr. Patrick \n", "\n", " Sex Age SibSp Parch Ticket Fare Cabin Embarked \\\n", "PassengerId \n", "887 male 27.0 0 0 211536 13.00 NaN S \n", "888 female 19.0 0 0 112053 30.00 B42 S \n", "889 female NaN 1 2 W./C. 6607 23.45 NaN S \n", "890 male 26.0 0 0 111369 30.00 C148 C \n", "891 male 32.0 0 0 370376 7.75 NaN Q \n", "\n", " AgeFillNA AgeFillMedian \n", "PassengerId \n", "887 27.0 27.0 \n", "888 19.0 19.0 \n", "889 0.0 28.0 \n", "890 26.0 26.0 \n", "891 32.0 32.0 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Copy of the whole frame in which every missing value is replaced with 0\n",
"fillna_df = df.fillna(value=0)\n",
"\n",
"print(fillna_df.shape)\n",
"\n",
"print(fillna_df.isna().any())\n",
"\n",
"# New column: missing ages replaced with 0\n",
"df[\"AgeFillNA\"] = df[\"Age\"].fillna(value=0)\n",
"\n",
"# New column: missing ages replaced with the median of the known ages\n",
"df[\"AgeFillMedian\"] = df[\"Age\"].fillna(value=df[\"Age\"].median())\n",
"\n",
"df.tail()" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedAgeFillNAAgeFillMedianAgeCopy
PassengerId
88702Montvila, Rev. Juozasmale27.00021153613.00NaNS27.027.027.0
88811Graham, Miss. Margaret Edithfemale19.00011205330.00B42S19.019.019.0
88903Johnston, Miss. Catherine Helen \"Carrie\"femaleNaN12W./C. 660723.45NaNS0.028.00.0
89011Behr, Mr. Karl Howellmale26.00011136930.00C148C26.026.026.0
89103Dooley, Mr. Patrickmale32.0003703767.75NaNQ32.032.032.0
\n", "
" ], "text/plain": [ " Survived Pclass Name \\\n", "PassengerId \n", "887 0 2 Montvila, Rev. Juozas \n", "888 1 1 Graham, Miss. Margaret Edith \n", "889 0 3 Johnston, Miss. Catherine Helen \"Carrie\" \n", "890 1 1 Behr, Mr. Karl Howell \n", "891 0 3 Dooley, Mr. Patrick \n", "\n", " Sex Age SibSp Parch Ticket Fare Cabin Embarked \\\n", "PassengerId \n", "887 male 27.0 0 0 211536 13.00 NaN S \n", "888 female 19.0 0 0 112053 30.00 B42 S \n", "889 female NaN 1 2 W./C. 6607 23.45 NaN S \n", "890 male 26.0 0 0 111369 30.00 C148 C \n", "891 male 32.0 0 0 370376 7.75 NaN Q \n", "\n", " AgeFillNA AgeFillMedian AgeCopy \n", "PassengerId \n", "887 27.0 27.0 27.0 \n", "888 19.0 19.0 19.0 \n", "889 0.0 28.0 0.0 \n", "890 26.0 26.0 26.0 \n", "891 32.0 32.0 32.0 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [
"df[\"AgeCopy\"] = df[\"Age\"]\n",
"\n",
"# Fill the missing values directly in the DataFrame, without creating a copy\n",
"df.fillna({\"AgeCopy\": 0}, inplace=True)\n",
"\n",
"df.tail()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Удаление наблюдений с пропусками" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(183, 14)\n", "Survived False\n", "Pclass False\n", "Name False\n", "Sex False\n", "Age False\n", "SibSp False\n", "Parch False\n", "Ticket False\n", "Fare False\n", "Cabin False\n", "Embarked False\n", "dtype: bool\n" ] } ], "source": [
"# Keep only the observations that have no missing value in any column\n",
"dropna_df = df.dropna()\n",
"\n",
"print(dropna_df.shape)\n",
"\n",
"# Check the frame produced by dropna() (not the earlier fillna_df)\n",
"print(dropna_df.isnull().any())" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Создание выборок данных\n", "\n", "Библиотека scikit-learn\n", "\n", "https://scikit-learn.org/stable/index.html" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [
"# Helper that creates the train / validation / test samples\n",
"import math\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"\n",
"def split_stratified_into_train_val_test(\n",
"    df_input,\n",
"    stratify_colname=\"y\",\n",
"    frac_train=0.6,\n",
"    frac_val=0.15,\n",
"    frac_test=0.25,\n",
"    random_state=None,\n",
"):\n",
"    \"\"\"\n",
"    Splits a Pandas dataframe into three subsets (train, val, and test)\n",
"    following fractional ratios provided by the user, where each subset is\n",
"    stratified by the values in a specific column (that is, each subset has\n",
"    the same relative frequency of the values in the column). It performs this\n",
"    splitting by running train_test_split() twice.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df_input : Pandas dataframe\n",
"        Input dataframe to be split.\n",
"    stratify_colname : str\n",
"        The name of the column that will be used for stratification. Usually\n",
"        this column would be for the label.\n",
"    frac_train : float\n",
"    frac_val : float\n",
"    frac_test : float\n",
"        The ratios with which the dataframe will be split into train, val, and\n",
"        test data. The values should be expressed as float fractions and should\n",
"        sum to 1.0 (the sum is checked with a floating-point tolerance).\n",
"    random_state : int, None, or RandomStateInstance\n",
"        Value to be passed to both train_test_split() calls.\n",
"\n",
"    Returns\n",
"    -------\n",
"    df_train, df_val, df_test :\n",
"        Dataframes containing the three splits.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If the fractions do not add up to 1.0 or if stratify_colname is not a\n",
"        column of df_input.\n",
"    \"\"\"\n",
"\n",
"    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 are not exactly\n",
"    # 1.0 in binary floating point and must not be rejected.\n",
"    if not math.isclose(frac_train + frac_val + frac_test, 1.0):\n",
"        raise ValueError(\n",
"            \"fractions %f, %f, %f do not add up to 1.0\"\n",
"            % (frac_train, frac_val, frac_test)\n",
"        )\n",
"\n",
"    if stratify_colname not in df_input.columns:\n",
"        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
"\n",
"    X = df_input  # Contains all columns.\n",
"    y = df_input[\n",
"        [stratify_colname]\n",
"    ]  # Dataframe of just the column on which to stratify.\n",
"\n",
"    # Split original dataframe into train and temp dataframes.\n",
"    df_train, df_temp, _, y_temp = train_test_split(\n",
"        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
"    )\n",
"\n",
"    # Split the temp dataframe into val and test dataframes; the test share\n",
"    # is re-expressed relative to the size of the temp dataframe.\n",
"    relative_frac_test = frac_test / (frac_val + frac_test)\n",
"    df_val, df_test, _, _ = train_test_split(\n",
"        df_temp,\n",
"        y_temp,\n",
"        stratify=y_temp,\n",
"        test_size=relative_frac_test,\n",
"        random_state=random_state,\n",
"    )\n",
"\n",
"    # Every input row must end up in exactly one of the three subsets.\n",
"    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"\n",
"    return df_train, df_val, df_test" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pclass\n", "3 491\n", "1 216\n", "2 184\n", "Name: count, dtype: int64\n", "Обучающая выборка: (534, 3)\n", "Pclass\n", "3 294\n", "1 130\n", "2 110\n", "Name: count, dtype: int64\n", "Контрольная выборка: (178, 3)\n", "Pclass\n", "3 98\n", "1 43\n", "2 37\n", "Name: count, dtype: int64\n", "Тестовая выборка: (179, 3)\n", "Pclass\n", "3 99\n", "1 43\n", "2 37\n", "Name: count, dtype: int64\n" ] } ], "source": 
[
"# Show how many observations fall into each label (class)\n",
"print(df.Pclass.value_counts())\n",
"\n",
"data = df[[\"Pclass\", \"Survived\", \"AgeFillMedian\"]].copy()\n",
"\n",
"# A fixed seed keeps the split reproducible between notebook runs\n",
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
"    data,\n",
"    stratify_colname=\"Pclass\",\n",
"    frac_train=0.60,\n",
"    frac_val=0.20,\n",
"    frac_test=0.20,\n",
"    random_state=42,\n",
")\n",
"\n",
"# Report the size and the class distribution of every sample\n",
"for title, sample in (\n",
"    (\"Обучающая выборка: \", df_train),\n",
"    (\"Контрольная выборка: \", df_val),\n",
"    (\"Тестовая выборка: \", df_test),\n",
"):\n",
"    print(title, sample.shape)\n",
"    print(sample.Pclass.value_counts())" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Выборка с избытком (oversampling)\n", "\n", "https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n", "\n", "https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n", "\n", "Выборка с недостатком (undersampling)\n", "\n", "https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n", "\n", "Библиотека imbalanced-learn\n", "\n", "https://imbalanced-learn.org/stable/" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Обучающая выборка: (534, 3)\n", "Pclass\n", "3 294\n", "1 130\n", "2 110\n", "Name: count, dtype: int64\n", "Обучающая выборка после oversampling: (864, 3)\n", "Pclass\n", "3 294\n", "2 290\n", "1 280\n", "Name: count, dtype: int64\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PclassSurvivedAgeFillMedian
03028.000000
13032.000000
23128.000000
31045.000000
4307.000000
............
8592026.887761
860210.890459
8612017.481437
8622017.078473
8632117.220445
\n", "

864 rows × 3 columns

\n", "
" ], "text/plain": [ " Pclass Survived AgeFillMedian\n", "0 3 0 28.000000\n", "1 3 0 32.000000\n", "2 3 1 28.000000\n", "3 1 0 45.000000\n", "4 3 0 7.000000\n", ".. ... ... ...\n", "859 2 0 26.887761\n", "860 2 1 0.890459\n", "861 2 0 17.481437\n", "862 2 0 17.078473\n", "863 2 1 17.220445\n", "\n", "[864 rows x 3 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [
"from imblearn.over_sampling import ADASYN\n",
"\n",
"# A fixed seed keeps the generated synthetic samples reproducible\n",
"ada = ADASYN(random_state=42)\n",
"\n",
"print(\"Обучающая выборка: \", df_train.shape)\n",
"print(df_train.Pclass.value_counts())\n",
"\n",
"# NOTE(review): the target column \"Pclass\" is also part of the feature matrix\n",
"# passed to ADASYN, presumably so that the resampled frame keeps it - confirm\n",
"# that this is intended.\n",
"X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"Pclass\"])\n",
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
"\n",
"print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
"print(df_train_adasyn.Pclass.value_counts())\n",
"\n",
"df_train_adasyn" ] } ],
"metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.6" } }, "nbformat": 4, "nbformat_minor": 2 }