MII/lec2.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Загрузка данных в DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 235 entries, 1 to 235\n",
      "Data columns (total 11 columns):\n",
      " #   Column                   Non-Null Count  Dtype \n",
      "---  ------                   --------------  ----- \n",
      " 0   Country (or dependency)  235 non-null    object\n",
      " 1   Population 2020          235 non-null    object\n",
      " 2   Yearly Change            235 non-null    object\n",
      " 3   Net Change               235 non-null    object\n",
      " 4   Density(P/Km²)           235 non-null    object\n",
      " 5   Land Area (Km²)          235 non-null    object\n",
      " 6   Migrants (net)           201 non-null    object\n",
      " 7   Fert. Rate               235 non-null    object\n",
      " 8   MedAge                   235 non-null    object\n",
      " 9   Urban Pop %              235 non-null    object\n",
      " 10  World Share              235 non-null    object\n",
      "dtypes: object(11)\n",
      "memory usage: 22.0+ KB\n",
      "(235, 11)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Country (or dependency)</th>\n",
       "      <th>Population 2020</th>\n",
       "      <th>Yearly Change</th>\n",
       "      <th>Net Change</th>\n",
       "      <th>Density(P/Km²)</th>\n",
       "      <th>Land Area (Km²)</th>\n",
       "      <th>Migrants (net)</th>\n",
       "      <th>Fert. Rate</th>\n",
       "      <th>MedAge</th>\n",
       "      <th>Urban Pop %</th>\n",
       "      <th>World Share</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>no</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>China</td>\n",
       "      <td>1,439,323,776</td>\n",
       "      <td>0.39%</td>\n",
       "      <td>5,540,090</td>\n",
       "      <td>153</td>\n",
       "      <td>9,388,211</td>\n",
       "      <td>-348,399</td>\n",
       "      <td>1.7</td>\n",
       "      <td>38</td>\n",
       "      <td>61%</td>\n",
       "      <td>18.47%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>India</td>\n",
       "      <td>1,380,004,385</td>\n",
       "      <td>0.99%</td>\n",
       "      <td>13,586,631</td>\n",
       "      <td>464</td>\n",
       "      <td>2,973,190</td>\n",
       "      <td>-532,687</td>\n",
       "      <td>2.2</td>\n",
       "      <td>28</td>\n",
       "      <td>35%</td>\n",
       "      <td>17.70%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>United States</td>\n",
       "      <td>331,002,651</td>\n",
       "      <td>0.59%</td>\n",
       "      <td>1,937,734</td>\n",
       "      <td>36</td>\n",
       "      <td>9,147,420</td>\n",
       "      <td>954,806</td>\n",
       "      <td>1.8</td>\n",
       "      <td>38</td>\n",
       "      <td>83%</td>\n",
       "      <td>4.25%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Indonesia</td>\n",
       "      <td>273,523,615</td>\n",
       "      <td>1.07%</td>\n",
       "      <td>2,898,047</td>\n",
       "      <td>151</td>\n",
       "      <td>1,811,570</td>\n",
       "      <td>-98,955</td>\n",
       "      <td>2.3</td>\n",
       "      <td>30</td>\n",
       "      <td>56%</td>\n",
       "      <td>3.51%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Pakistan</td>\n",
       "      <td>220,892,340</td>\n",
       "      <td>2.00%</td>\n",
       "      <td>4,327,022</td>\n",
       "      <td>287</td>\n",
       "      <td>770,880</td>\n",
       "      <td>-233,379</td>\n",
       "      <td>3.6</td>\n",
       "      <td>23</td>\n",
       "      <td>35%</td>\n",
       "      <td>2.83%</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Country (or dependency) Population 2020 Yearly Change  Net Change  \\\n",
       "no                                                                     \n",
       "1                    China   1,439,323,776         0.39%   5,540,090   \n",
       "2                    India   1,380,004,385         0.99%  13,586,631   \n",
       "3            United States     331,002,651         0.59%   1,937,734   \n",
       "4                Indonesia     273,523,615         1.07%   2,898,047   \n",
       "5                 Pakistan     220,892,340         2.00%   4,327,022   \n",
       "\n",
       "   Density(P/Km²) Land Area (Km²) Migrants (net) Fert. Rate MedAge  \\\n",
       "no                                                                   \n",
       "1             153       9,388,211       -348,399        1.7     38   \n",
       "2             464       2,973,190       -532,687        2.2     28   \n",
       "3              36       9,147,420        954,806        1.8     38   \n",
       "4             151       1,811,570        -98,955        2.3     30   \n",
       "5             287         770,880       -233,379        3.6     23   \n",
       "\n",
       "   Urban Pop % World Share  \n",
       "no                          \n",
       "1          61%      18.47%  \n",
       "2          35%      17.70%  \n",
       "3          83%       4.25%  \n",
       "4          56%       3.51%  \n",
       "5          35%       2.83%  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"data/population.csv\", index_col=\"no\")\n",
    "\n",
    "df.info()\n",
    "\n",
    "print(df.shape)\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Получение сведений о пропущенных данных"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Типы пропущенных данных:\n",
    "- None - представление пустых данных в Python\n",
    "- NaN - представление пустых данных в Pandas\n",
    "- '' - пустая строка"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Country (or dependency)     0\n",
      "Population 2020             0\n",
      "Yearly Change               0\n",
      "Net Change                  0\n",
      "Density  (P/Km²)            0\n",
      "Land Area (Km²)             0\n",
      "Migrants (net)             34\n",
      "Fert. Rate                  0\n",
      "Med. Age                    0\n",
      "Urban Pop %                 0\n",
      "World Share                 0\n",
      "dtype: int64\n",
      "\n",
      "Country (or dependency)    False\n",
      "Population 2020            False\n",
      "Yearly Change              False\n",
      "Net Change                 False\n",
      "Density  (P/Km²)           False\n",
      "Land Area (Km²)            False\n",
      "Migrants (net)              True\n",
      "Fert. Rate                 False\n",
      "Med. Age                   False\n",
      "Urban Pop %                False\n",
      "World Share                False\n",
      "dtype: bool\n",
      "\n",
      "Migrants (net) процент пустых значений: %14.47\n"
     ]
    }
   ],
   "source": [
    "\n",
    "\n",
    "# Количество пустых значений признаков\n",
    "print(df.isnull().sum())\n",
    "\n",
    "print()\n",
    "\n",
    "# Есть ли пустые значения признаков\n",
    "print(df.isnull().any())\n",
    "\n",
    "print()\n",
    "\n",
    "# Процент пустых значений признаков\n",
    "for i in df.columns:\n",
    "    null_rate = df[i].isnull().sum() / len(df) * 100\n",
    "    if null_rate > 0:\n",
    "        print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Заполнение пропущенных данных\n",
    "\n",
    "https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n",
    "\n",
    "https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(235, 12)\n",
      "Country (or dependency)    False\n",
      "Population 2020            False\n",
      "Yearly Change              False\n",
      "Net Change                 False\n",
      "Density  (P/Km²)           False\n",
      "Land Area (Km²)            False\n",
      "Migrants (net)             False\n",
      "Fert. Rate                 False\n",
      "Med. Age                   False\n",
      "Urban Pop %                False\n",
      "World Share                False\n",
      "MigrantsFill               False\n",
      "dtype: bool\n"
     ]
    },
    {
     "ename": "TypeError",
     "evalue": "Cannot convert ['-348,399' '-532,687' '954,806' '-98,955' '-233,379' '21,200' '-60,000'\n '-369,501' '182,456' '-60,000' '71,560' '30,000' '-67,152' '-38,033'\n '-80,000' '23,861' '283,922' '-55,000' '543,822' '19,444' '260,650'\n '36,527' '148,943' '-40,076' '145,405' '-163,313' '-10,000' '11,731'\n '204,796' '40,000' '168,694' '4,800' '-10,000' '-50,000' '10,000' '7,834'\n '-62,920' '-29,395' '242,032' '-51,419' '134,979' '-8,863' '99,069'\n '6,413' '50,000' '-5,000' '-10,000' '-30,000' '41,710' '-653,249'\n '-1,500' '-4,800' '-8,000' '-5,403' '158,246' '4,000' '30,001' '-97,986'\n '-25,000' '-40,000' '-73,999' '-16,053' '111,708' '-18,000' '-8,000'\n '-9,215' '36,400' '-427,391' '16,000' '-20,000' '-30,000' '2,000'\n '-40,000' '-116,858' '-4,000' '-9,000' '-2,000' '2,001' '-4,000' '-9,504'\n '48,000' '-35,000' '-14,400' '-174,200' '-30,000' '22,011' '-16,000'\n '10,220' '-6,000' '1,200' '40,000' '-6,800' '40,000' '6,000' '-20,000'\n '8,730' '65,000' '-800' '4,000' '10,000' '52,000' '-2,000' '-4,200'\n '29,308' '-14,704' '-16,556' '-4,800' '-1,999' '-30,012' '-21,272'\n '-4,000' '-40,539' '-5,000' '27,028' '15,200' '14,000' '-4,000' '1,485'\n '28,000' '87,400' '-10,563' '4,200' '-5,000' '23,604' '-40,000' '14,881'\n '5,000' '11,200' '39,520' '-8,001' '-1,387' '-10,000' '-39,858' '-3,000'\n '-21,585' '-852' '-4,998' '-11,332' '40,000' '-14,000' '-97,986'\n '-32,780' '-4,806' '-3,087' '3,000' '3,260' '-10,047' '-1,000' '2,000'\n '-1,399' '-14,837' '47,800' '16,000' '-800' '3,911' '-5,385' '0' '5,000'\n '-8,353' '900' '-6,202' '-1,256' '-2,000' '-6,000' '320' '-1,600' '5,000'\n '-480' '9,741' '5,582' '-1,000' '-1,342' '-2,957' '11,370' '900' '0'\n '-1,440' '1,200' '1,000' '-960' '380' '120' '1,200' '-79' '502' '-1,000'\n '0' '-1,680' '-2,803' '0' '1,351' '-506' '515' '-800' '-200' '-200' '201'\n '-800' '-451' '-200' '0' nan nan nan nan nan nan nan nan nan nan nan nan\n nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan\n nan nan nan nan] to numeric",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[11], line 11\u001b[0m\n\u001b[0;32m      8\u001b[0m df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMigrantsFill\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMigrants (net)\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mfillna(\u001b[38;5;241m0\u001b[39m)\n\u001b[0;32m     10\u001b[0m \u001b[38;5;66;03m# Замена пустых данных на медиану\u001b[39;00m\n\u001b[1;32m---> 11\u001b[0m df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMigrantsMedian\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMigrants (net)\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mfillna(\u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mMigrants (net)\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmedian\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[0;32m     13\u001b[0m df\u001b[38;5;241m.\u001b[39mtail()\n",
      "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\pandas\\core\\series.py:6559\u001b[0m, in \u001b[0;36mSeries.median\u001b[1;34m(self, axis, skipna, numeric_only, **kwargs)\u001b[0m\n\u001b[0;32m   6551\u001b[0m \u001b[38;5;129m@doc\u001b[39m(make_doc(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmedian\u001b[39m\u001b[38;5;124m\"\u001b[39m, ndim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m))\n\u001b[0;32m   6552\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mmedian\u001b[39m(\n\u001b[0;32m   6553\u001b[0m     \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   6557\u001b[0m     \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m   6558\u001b[0m ):\n\u001b[1;32m-> 6559\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mNDFrame\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmedian\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnumeric_only\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\pandas\\core\\generic.py:12431\u001b[0m, in \u001b[0;36mNDFrame.median\u001b[1;34m(self, axis, skipna, numeric_only, **kwargs)\u001b[0m\n\u001b[0;32m  12424\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mmedian\u001b[39m(\n\u001b[0;32m  12425\u001b[0m     \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m  12426\u001b[0m     axis: Axis \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m  12429\u001b[0m     \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m  12430\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Series \u001b[38;5;241m|\u001b[39m \u001b[38;5;28mfloat\u001b[39m:\n\u001b[1;32m> 12431\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_stat_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m  12432\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmedian\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnanops\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnanmedian\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnumeric_only\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[0;32m  12433\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\pandas\\core\\generic.py:12377\u001b[0m, in \u001b[0;36mNDFrame._stat_function\u001b[1;34m(self, name, func, axis, skipna, numeric_only, **kwargs)\u001b[0m\n\u001b[0;32m  12373\u001b[0m nv\u001b[38;5;241m.\u001b[39mvalidate_func(name, (), kwargs)\n\u001b[0;32m  12375\u001b[0m validate_bool_kwarg(skipna, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mskipna\u001b[39m\u001b[38;5;124m\"\u001b[39m, none_allowed\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m> 12377\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_reduce\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m  12378\u001b[0m \u001b[43m    \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnumeric_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnumeric_only\u001b[49m\n\u001b[0;32m  12379\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\pandas\\core\\series.py:6457\u001b[0m, in \u001b[0;36mSeries._reduce\u001b[1;34m(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)\u001b[0m\n\u001b[0;32m   6452\u001b[0m     \u001b[38;5;66;03m# GH#47500 - change to TypeError to match other methods\u001b[39;00m\n\u001b[0;32m   6453\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[0;32m   6454\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSeries.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m does not allow \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkwd_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnumeric_only\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   6455\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwith non-numeric dtypes.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   6456\u001b[0m     )\n\u001b[1;32m-> 6457\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdelegate\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\pandas\\core\\nanops.py:147\u001b[0m, in \u001b[0;36mbottleneck_switch.__call__.<locals>.f\u001b[1;34m(values, axis, skipna, **kwds)\u001b[0m\n\u001b[0;32m    145\u001b[0m         result \u001b[38;5;241m=\u001b[39m alt(values, axis\u001b[38;5;241m=\u001b[39maxis, skipna\u001b[38;5;241m=\u001b[39mskipna, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds)\n\u001b[0;32m    146\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 147\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[43malt\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    149\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n",
      "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\pandas\\core\\nanops.py:787\u001b[0m, in \u001b[0;36mnanmedian\u001b[1;34m(values, axis, skipna, mask)\u001b[0m\n\u001b[0;32m    785\u001b[0m     inferred \u001b[38;5;241m=\u001b[39m lib\u001b[38;5;241m.\u001b[39minfer_dtype(values)\n\u001b[0;32m    786\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m inferred \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstring\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmixed\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[1;32m--> 787\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCannot convert \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalues\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m to numeric\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m    788\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m    789\u001b[0m     values \u001b[38;5;241m=\u001b[39m values\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mf8\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
      "\u001b[1;31mTypeError\u001b[0m: Cannot convert ['-348,399' '-532,687' '954,806' '-98,955' '-233,379' '21,200' '-60,000'\n '-369,501' '182,456' '-60,000' '71,560' '30,000' '-67,152' '-38,033'\n '-80,000' '23,861' '283,922' '-55,000' '543,822' '19,444' '260,650'\n '36,527' '148,943' '-40,076' '145,405' '-163,313' '-10,000' '11,731'\n '204,796' '40,000' '168,694' '4,800' '-10,000' '-50,000' '10,000' '7,834'\n '-62,920' '-29,395' '242,032' '-51,419' '134,979' '-8,863' '99,069'\n '6,413' '50,000' '-5,000' '-10,000' '-30,000' '41,710' '-653,249'\n '-1,500' '-4,800' '-8,000' '-5,403' '158,246' '4,000' '30,001' '-97,986'\n '-25,000' '-40,000' '-73,999' '-16,053' '111,708' '-18,000' '-8,000'\n '-9,215' '36,400' '-427,391' '16,000' '-20,000' '-30,000' '2,000'\n '-40,000' '-116,858' '-4,000' '-9,000' '-2,000' '2,001' '-4,000' '-9,504'\n '48,000' '-35,000' '-14,400' '-174,200' '-30,000' '22,011' '-16,000'\n '10,220' '-6,000' '1,200' '40,000' '-6,800' '40,000' '6,000' '-20,000'\n '8,730' '65,000' '-800' '4,000' '10,000' '52,000' '-2,000' '-4,200'\n '29,308' '-14,704' '-16,556' '-4,800' '-1,999' '-30,012' '-21,272'\n '-4,000' '-40,539' '-5,000' '27,028' '15,200' '14,000' '-4,000' '1,485'\n '28,000' '87,400' '-10,563' '4,200' '-5,000' '23,604' '-40,000' '14,881'\n '5,000' '11,200' '39,520' '-8,001' '-1,387' '-10,000' '-39,858' '-3,000'\n '-21,585' '-852' '-4,998' '-11,332' '40,000' '-14,000' '-97,986'\n '-32,780' '-4,806' '-3,087' '3,000' '3,260' '-10,047' '-1,000' '2,000'\n '-1,399' '-14,837' '47,800' '16,000' '-800' '3,911' '-5,385' '0' '5,000'\n '-8,353' '900' '-6,202' '-1,256' '-2,000' '-6,000' '320' '-1,600' '5,000'\n '-480' '9,741' '5,582' '-1,000' '-1,342' '-2,957' '11,370' '900' '0'\n '-1,440' '1,200' '1,000' '-960' '380' '120' '1,200' '-79' '502' '-1,000'\n '0' '-1,680' '-2,803' '0' '1,351' '-506' '515' '-800' '-200' '-200' '201'\n '-800' '-451' '-200' '0' nan nan nan nan nan nan nan nan nan nan nan nan\n nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan\n nan nan nan nan] to numeric"
     ]
    }
   ],
   "source": [
    "fillna_df = df.fillna(0)\n",
    "\n",
    "print(fillna_df.shape) # размеры\n",
    "\n",
    "print(fillna_df.isnull().any())\n",
    "\n",
    "# Замена пустых данных на 0\n",
    "df[\"MigrantsFill\"] = df[\"Migrants (net)\"].fillna(0)\n",
    "\n",
    "# Замена пустых данных на медиану\n",
    "df[\"MigrantsMedian\"] = df[\"Migrants (net)\"].fillna(df[\"Migrants (net)\"].median())\n",
    "\n",
    "df.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Country (or dependency)</th>\n",
       "      <th>Population 2020</th>\n",
       "      <th>Yearly Change</th>\n",
       "      <th>Net Change</th>\n",
       "      <th>Density  (P/Km²)</th>\n",
       "      <th>Land Area (Km²)</th>\n",
       "      <th>Migrants (net)</th>\n",
       "      <th>Fert. Rate</th>\n",
       "      <th>Med. Age</th>\n",
       "      <th>Urban Pop %</th>\n",
       "      <th>World Share</th>\n",
       "      <th>MigrantsFill</th>\n",
       "      <th>MigrantsCopy</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>no</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>231</th>\n",
       "      <td>Montserrat</td>\n",
       "      <td>4,992</td>\n",
       "      <td>0.06%</td>\n",
       "      <td>3</td>\n",
       "      <td>50</td>\n",
       "      <td>100</td>\n",
       "      <td>NaN</td>\n",
       "      <td>N.A.</td>\n",
       "      <td>N.A.</td>\n",
       "      <td>10%</td>\n",
       "      <td>0.00%</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>232</th>\n",
       "      <td>Falkland Islands</td>\n",
       "      <td>3,480</td>\n",
       "      <td>3.05%</td>\n",
       "      <td>103</td>\n",
       "      <td>0</td>\n",
       "      <td>12,170</td>\n",
       "      <td>NaN</td>\n",
       "      <td>N.A.</td>\n",
       "      <td>N.A.</td>\n",
       "      <td>66%</td>\n",
       "      <td>0.00%</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>233</th>\n",
       "      <td>Niue</td>\n",
       "      <td>1,626</td>\n",
       "      <td>0.68%</td>\n",
       "      <td>11</td>\n",
       "      <td>6</td>\n",
       "      <td>260</td>\n",
       "      <td>NaN</td>\n",
       "      <td>N.A.</td>\n",
       "      <td>N.A.</td>\n",
       "      <td>46%</td>\n",
       "      <td>0.00%</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>234</th>\n",
       "      <td>Tokelau</td>\n",
       "      <td>1,357</td>\n",
       "      <td>1.27%</td>\n",
       "      <td>17</td>\n",
       "      <td>136</td>\n",
       "      <td>10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>N.A.</td>\n",
       "      <td>N.A.</td>\n",
       "      <td>0%</td>\n",
       "      <td>0.00%</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>235</th>\n",
       "      <td>Holy See</td>\n",
       "      <td>801</td>\n",
       "      <td>0.25%</td>\n",
       "      <td>2</td>\n",
       "      <td>2,003</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>N.A.</td>\n",
       "      <td>N.A.</td>\n",
       "      <td>N.A.</td>\n",
       "      <td>0.00%</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Country (or dependency) Population 2020 Yearly Change Net Change  \\\n",
       "no                                                                     \n",
       "231              Montserrat           4,992         0.06%          3   \n",
       "232        Falkland Islands           3,480         3.05%        103   \n",
       "233                    Niue           1,626         0.68%         11   \n",
       "234                 Tokelau           1,357         1.27%         17   \n",
       "235                Holy See             801         0.25%          2   \n",
       "\n",
       "    Density  (P/Km²) Land Area (Km²) Migrants (net) Fert. Rate Med. Age  \\\n",
       "no                                                                        \n",
       "231               50             100            NaN       N.A.     N.A.   \n",
       "232                0          12,170            NaN       N.A.     N.A.   \n",
       "233                6             260            NaN       N.A.     N.A.   \n",
       "234              136              10            NaN       N.A.     N.A.   \n",
       "235            2,003               0            NaN       N.A.     N.A.   \n",
       "\n",
       "    Urban Pop % World Share MigrantsFill MigrantsCopy  \n",
       "no                                                     \n",
       "231         10%       0.00%            0            0  \n",
       "232         66%       0.00%            0            0  \n",
       "233         46%       0.00%            0            0  \n",
       "234          0%       0.00%            0            0  \n",
       "235        N.A.       0.00%            0            0  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\"MigrantsCopy\"] = df[\"Migrants (net)\"]\n",
    "\n",
    "# Замена данных сразу в DataFrame без копирования\n",
    "df.fillna({\"MigrantsCopy\": 0}, inplace=True)\n",
    "\n",
    "df.tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Удаление наблюдений с пропусками"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(201, 13)\n",
      "Country (or dependency)    False\n",
      "Population 2020            False\n",
      "Yearly Change              False\n",
      "Net Change                 False\n",
      "Density  (P/Km²)           False\n",
      "Land Area (Km²)            False\n",
      "Migrants (net)             False\n",
      "Fert. Rate                 False\n",
      "Med. Age                   False\n",
      "Urban Pop %                False\n",
      "World Share                False\n",
      "MigrantsFill               False\n",
      "dtype: bool\n"
     ]
    }
   ],
   "source": [
    "dropna_df = df.dropna()\n",
    "\n",
    "print(dropna_df.shape)\n",
    "\n",
    "print(fillna_df.isnull().any())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Создание выборок данных\n",
    "\n",
    "Библиотека scikit-learn\n",
    "\n",
    "https://scikit-learn.org/stable/index.html"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img src=\"assets/lec2-split.png\" width=\"600\" style=\"background-color: white\">"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Функция для создания выборок\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "\n",
    "def split_stratified_into_train_val_test(\n",
    "    df_input,\n",
    "    stratify_colname=\"y\",\n",
    "    frac_train=0.6,\n",
    "    frac_val=0.15,\n",
    "    frac_test=0.25,\n",
    "    random_state=None,\n",
    "):\n",
    "    \"\"\"\n",
    "    Splits a Pandas dataframe into three subsets (train, val, and test)\n",
    "    following fractional ratios provided by the user, where each subset is\n",
    "    stratified by the values in a specific column (that is, each subset has\n",
    "    the same relative frequency of the values in the column). It performs this\n",
    "    splitting by running train_test_split() twice.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df_input : Pandas dataframe\n",
    "        Input dataframe to be split.\n",
    "    stratify_colname : str\n",
    "        The name of the column that will be used for stratification. Usually\n",
    "        this column would be for the label.\n",
    "    frac_train : float\n",
    "    frac_val   : float\n",
    "    frac_test  : float\n",
    "        The ratios with which the dataframe will be split into train, val, and\n",
    "        test data. The values should be expressed as float fractions and should\n",
    "        sum to 1.0.\n",
    "    random_state : int, None, or RandomStateInstance\n",
    "        Value to be passed to train_test_split().\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    df_train, df_val, df_test :\n",
    "        Dataframes containing the three splits.\n",
    "    \"\"\"\n",
    "\n",
    "    if frac_train + frac_val + frac_test != 1.0:\n",
    "        raise ValueError(\n",
    "            \"fractions %f, %f, %f do not add up to 1.0\"\n",
    "            % (frac_train, frac_val, frac_test)\n",
    "        )\n",
    "\n",
    "    if stratify_colname not in df_input.columns:\n",
    "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
    "\n",
    "    X = df_input  # Contains all columns.\n",
    "    y = df_input[\n",
    "        [stratify_colname]\n",
    "    ]  # Dataframe of just the column on which to stratify.\n",
    "\n",
    "    # Split original dataframe into train and temp dataframes.\n",
    "    df_train, df_temp, y_train, y_temp = train_test_split(\n",
    "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
    "    )\n",
    "\n",
    "    # Split the temp dataframe into val and test dataframes.\n",
    "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
    "    df_val, df_test, y_val, y_test = train_test_split(\n",
    "        df_temp,\n",
    "        y_temp,\n",
    "        stratify=y_temp,\n",
    "        test_size=relative_frac_test,\n",
    "        random_state=random_state,\n",
    "    )\n",
    "\n",
    "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
    "\n",
    "    return df_train, df_val, df_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MedAge\n",
      "N.A.    34\n",
      "19      14\n",
      "28      12\n",
      "43      11\n",
      "32      11\n",
      "42      10\n",
      "18      10\n",
      "20       9\n",
      "30       8\n",
      "38       7\n",
      "26       7\n",
      "40       7\n",
      "22       7\n",
      "31       6\n",
      "34       6\n",
      "24       6\n",
      "17       6\n",
      "44       5\n",
      "29       5\n",
      "41       5\n",
      "33       5\n",
      "21       5\n",
      "45       5\n",
      "23       4\n",
      "37       4\n",
      "36       4\n",
      "25       4\n",
      "27       4\n",
      "39       3\n",
      "46       3\n",
      "35       3\n",
      "47       2\n",
      "48       1\n",
      "15       1\n",
      "16       1\n",
      "Name: count, dtype: int64\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[17], line 6\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[38;5;28mprint\u001b[39m(df\u001b[38;5;241m.\u001b[39mMedAge\u001b[38;5;241m.\u001b[39mvalue_counts())\n\u001b[0;32m      4\u001b[0m data \u001b[38;5;241m=\u001b[39m df[[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMedAge\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFert. Rate\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDensity(P/Km²)\u001b[39m\u001b[38;5;124m\"\u001b[39m]]\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[1;32m----> 6\u001b[0m df_train, df_val, df_test \u001b[38;5;241m=\u001b[39m \u001b[43msplit_stratified_into_train_val_test\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m      7\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m      8\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstratify_colname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mMedAge\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m      9\u001b[0m \u001b[43m    \u001b[49m\u001b[43mfrac_train\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.60\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m     10\u001b[0m \u001b[43m    \u001b[49m\u001b[43mfrac_val\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m     11\u001b[0m \u001b[43m    \u001b[49m\u001b[43mfrac_test\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m     12\u001b[0m \u001b[43m)\u001b[49m\n\u001b[0;32m     14\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_train\u001b[38;5;241m.\u001b[39mshape)\n\u001b[0;32m     15\u001b[0m \u001b[38;5;28mprint\u001b[39m(df_train\u001b[38;5;241m.\u001b[39mMedAge\u001b[38;5;241m.\u001b[39mvalue_counts())\n",
      "Cell \u001b[1;32mIn[16], line 57\u001b[0m, in \u001b[0;36msplit_stratified_into_train_val_test\u001b[1;34m(df_input, stratify_colname, frac_train, frac_val, frac_test, random_state)\u001b[0m\n\u001b[0;32m     52\u001b[0m y \u001b[38;5;241m=\u001b[39m df_input[\n\u001b[0;32m     53\u001b[0m     [stratify_colname]\n\u001b[0;32m     54\u001b[0m ]  \u001b[38;5;66;03m# Dataframe of just the column on which to stratify.\u001b[39;00m\n\u001b[0;32m     56\u001b[0m \u001b[38;5;66;03m# Split original dataframe into train and temp dataframes.\u001b[39;00m\n\u001b[1;32m---> 57\u001b[0m df_train, df_temp, y_train, y_temp \u001b[38;5;241m=\u001b[39m \u001b[43mtrain_test_split\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m     58\u001b[0m \u001b[43m    \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstratify\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m1.0\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mfrac_train\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrandom_state\u001b[49m\n\u001b[0;32m     59\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m     61\u001b[0m \u001b[38;5;66;03m# Split the temp dataframe into val and test dataframes.\u001b[39;00m\n\u001b[0;32m     62\u001b[0m relative_frac_test \u001b[38;5;241m=\u001b[39m frac_test \u001b[38;5;241m/\u001b[39m (frac_val \u001b[38;5;241m+\u001b[39m frac_test)\n",
      "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:213\u001b[0m, in \u001b[0;36mvalidate_params.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m    207\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m    208\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m    209\u001b[0m         skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m    210\u001b[0m             prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m    211\u001b[0m         )\n\u001b[0;32m    212\u001b[0m     ):\n\u001b[1;32m--> 213\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    214\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m    215\u001b[0m     \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m    216\u001b[0m     \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m    217\u001b[0m     \u001b[38;5;66;03m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m    218\u001b[0m     \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m    219\u001b[0m     msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[0;32m    220\u001b[0m         \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m    221\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m    222\u001b[0m         \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[0;32m    223\u001b[0m     )\n",
      "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:2806\u001b[0m, in \u001b[0;36mtrain_test_split\u001b[1;34m(test_size, train_size, random_state, shuffle, stratify, *arrays)\u001b[0m\n\u001b[0;32m   2802\u001b[0m         CVClass \u001b[38;5;241m=\u001b[39m ShuffleSplit\n\u001b[0;32m   2804\u001b[0m     cv \u001b[38;5;241m=\u001b[39m CVClass(test_size\u001b[38;5;241m=\u001b[39mn_test, train_size\u001b[38;5;241m=\u001b[39mn_train, random_state\u001b[38;5;241m=\u001b[39mrandom_state)\n\u001b[1;32m-> 2806\u001b[0m     train, test \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcv\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43marrays\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstratify\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   2808\u001b[0m train, test \u001b[38;5;241m=\u001b[39m ensure_common_namespace_device(arrays[\u001b[38;5;241m0\u001b[39m], train, test)\n\u001b[0;32m   2810\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(\n\u001b[0;32m   2811\u001b[0m     chain\u001b[38;5;241m.\u001b[39mfrom_iterable(\n\u001b[0;32m   2812\u001b[0m         (_safe_indexing(a, train), _safe_indexing(a, test)) \u001b[38;5;28;01mfor\u001b[39;00m a \u001b[38;5;129;01min\u001b[39;00m arrays\n\u001b[0;32m   2813\u001b[0m     )\n\u001b[0;32m   2814\u001b[0m )\n",
      "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:1843\u001b[0m, in \u001b[0;36mBaseShuffleSplit.split\u001b[1;34m(self, X, y, groups)\u001b[0m\n\u001b[0;32m   1813\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Generate indices to split data into training and test set.\u001b[39;00m\n\u001b[0;32m   1814\u001b[0m \n\u001b[0;32m   1815\u001b[0m \u001b[38;5;124;03mParameters\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   1840\u001b[0m \u001b[38;5;124;03mto an integer.\u001b[39;00m\n\u001b[0;32m   1841\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m   1842\u001b[0m X, y, groups \u001b[38;5;241m=\u001b[39m indexable(X, y, groups)\n\u001b[1;32m-> 1843\u001b[0m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_iter_indices\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgroups\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[0;32m   1844\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;28;43;01myield\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\n",
      "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:2252\u001b[0m, in \u001b[0;36mStratifiedShuffleSplit._iter_indices\u001b[1;34m(self, X, y, groups)\u001b[0m\n\u001b[0;32m   2250\u001b[0m class_counts \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mbincount(y_indices)\n\u001b[0;32m   2251\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m np\u001b[38;5;241m.\u001b[39mmin(class_counts) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m2\u001b[39m:\n\u001b[1;32m-> 2252\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m   2253\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe least populated class in y has only 1\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   2254\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m member, which is too few. The minimum\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   2255\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m number of groups for any class cannot\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   2256\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m be less than 2.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   2257\u001b[0m     )\n\u001b[0;32m   2259\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n_train \u001b[38;5;241m<\u001b[39m n_classes:\n\u001b[0;32m   2260\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m   2261\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe train_size = \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m should be greater or \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   2262\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mequal to the number of classes = \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m (n_train, n_classes)\n\u001b[0;32m   2263\u001b[0m     )\n",
      "\u001b[1;31mValueError\u001b[0m: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2."
     ]
    }
   ],
   "source": [
    "# Вывод распределения количества наблюдений по меткам (классам)\n",
    "print(df.MedAge.value_counts())\n",
    "\n",
    "data = df[[\"MedAge\", \"Fert. Rate\", \"Density(P/Km²)\"]].copy()\n",
    "\n",
    "df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
    "    data,\n",
    "    stratify_colname=\"MedAge\",\n",
    "    frac_train=0.60,\n",
    "    frac_val=0.20,\n",
    "    frac_test=0.20,\n",
    ")\n",
    "\n",
    "print(\"Обучающая выборка: \", df_train.shape)\n",
    "print(df_train.MedAge.value_counts())\n",
    "\n",
    "print(\"Контрольная выборка: \", df_val.shape)\n",
    "print(df_val.MedAge.value_counts())\n",
    "\n",
    "print(\"Тестовая выборка: \", df_test.shape)\n",
    "print(df_test.MedAge.value_counts())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Выборка с избытком (oversampling)\n",
    "\n",
    "https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n",
    "\n",
    "https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n",
    "\n",
    "Выборка с недостатком (undersampling)\n",
    "\n",
    "https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n",
    "\n",
    "Библиотека imbalanced-learn\n",
    "\n",
    "https://imbalanced-learn.org/stable/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'imblearn'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[1], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mimblearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mover_sampling\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ADASYN\n\u001b[0;32m      3\u001b[0m ada \u001b[38;5;241m=\u001b[39m ADASYN()\n\u001b[0;32m      5\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_train\u001b[38;5;241m.\u001b[39mshape)\n",
      "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'imblearn'"
     ]
    }
   ],
   "source": [
    "from imblearn.over_sampling import ADASYN\n",
    "\n",
    "ada = ADASYN()\n",
    "\n",
    "print(\"Обучающая выборка: \", df_train.shape)\n",
    "print(df_train.Pclass.value_counts())\n",
    "\n",
    "X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"Pclass\"])\n",
    "df_train_adasyn = pd.DataFrame(X_resampled)\n",
    "\n",
    "print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
    "print(df_train_adasyn.Pclass.value_counts())\n",
    "\n",
    "df_train_adasyn"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}