{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Загрузка данных в DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "ename": "",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31mRunning cells with 'Python 3.9.13' requires the ipykernel package.\n",
      "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n",
      "\u001b[1;31mCommand: 'c:/Users/ogoro/AppData/Local/Programs/Python/Python39/python.exe -m pip install ipykernel -U --user --force-reinstall'"
     ]
    }
   ],
   "source": [
    "from numpy import nan\n",
    "import pandas as pd\n",
    "\n",
    "# Load the dataset; the \"ID\" column becomes the row index.\n",
    "df = pd.read_csv(\"data/car_price_prediction.csv\", index_col=\"ID\")\n",
    "\n",
    "# Encode the Yes/No flag as 1/0. map() builds the integer column directly\n",
    "# instead of relying on the implicit downcasting performed by replace().\n",
    "df[\"Leather_interior\"] = df[\"Leather_interior\"].map({\"Yes\": 1, \"No\": 0})\n",
    "\n",
    "# \"-\" marks a missing levy. Turn it into NaN and make the column numeric so\n",
    "# that later statistics (median, etc.) do not operate on strings.\n",
    "df[\"Levy\"] = pd.to_numeric(df[\"Levy\"].replace(\"-\", nan))\n",
    "\n",
    "df.info()\n",
    "print(df.shape)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Получение сведений о пропущенных данных"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Типы пропущенных данных:\n",
    "- None - представление пустых данных в Python\n",
    "- NaN (`numpy.nan`) - представление пустых числовых данных в NumPy и Pandas\n",
    "- '' - пустая строка"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Price                  0\n",
      "Levy                5819\n",
      "Manufacturer           0\n",
      "Model                  0\n",
      "Prod_year              0\n",
      "Category               0\n",
      "Leather_interior       0\n",
      "Fuel type              0\n",
      "Engine volume          0\n",
      "Mileage                0\n",
      "Cylinders              0\n",
      "Gear box type          0\n",
      "Drive wheels           0\n",
      "Doors                  0\n",
      "Wheel                  0\n",
      "Color                  0\n",
      "Airbags                0\n",
      "dtype: int64\n",
      "\n",
      "Price               False\n",
      "Levy                 True\n",
      "Manufacturer        False\n",
      "Model               False\n",
      "Prod_year           False\n",
      "Category            False\n",
      "Leather_interior    False\n",
      "Fuel type           False\n",
      "Engine volume       False\n",
      "Mileage             False\n",
      "Cylinders           False\n",
      "Gear box type       False\n",
      "Drive wheels        False\n",
      "Doors               False\n",
      "Wheel               False\n",
      "Color               False\n",
      "Airbags             False\n",
      "dtype: bool\n",
      "\n",
      "Levy процент пустых значений: %30.25\n"
     ]
    }
   ],
   "source": [
    "# Number of missing values per feature\n",
    "print(df.isnull().sum())\n",
    "\n",
    "print()\n",
    "\n",
    "# Whether each feature has any missing values at all\n",
    "print(df.isnull().any())\n",
    "\n",
    "print()\n",
    "\n",
    "# Share of missing values (in percent), reported only for features that have any.\n",
    "# The mean of the boolean mask equals missing_count / row_count.\n",
    "null_rates = df.isnull().mean() * 100\n",
    "for column, null_rate in null_rates[null_rates > 0].items():\n",
    "    # The percent sign goes after the number (\"30.25%\", not \"%30.25\")\n",
    "    print(f\"{column} процент пустых значений: {null_rate:.2f}%\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Заполнение пропущенных данных\n",
    "\n",
    "https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n",
    "\n",
    "https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(19237, 17)\n",
      "Price               False\n",
      "Levy                False\n",
      "Manufacturer        False\n",
      "Model               False\n",
      "Prod_year           False\n",
      "Category            False\n",
      "Leather_interior    False\n",
      "Fuel type           False\n",
      "Engine volume       False\n",
      "Mileage             False\n",
      "Cylinders           False\n",
      "Gear box type       False\n",
      "Drive wheels        False\n",
      "Doors               False\n",
      "Wheel               False\n",
      "Color               False\n",
      "Airbags             False\n",
      "dtype: bool\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Price</th>\n",
       "      <th>Levy</th>\n",
       "      <th>Manufacturer</th>\n",
       "      <th>Model</th>\n",
       "      <th>Prod_year</th>\n",
       "      <th>Category</th>\n",
       "      <th>Leather_interior</th>\n",
       "      <th>Fuel type</th>\n",
       "      <th>Engine volume</th>\n",
       "      <th>Mileage</th>\n",
       "      <th>Cylinders</th>\n",
       "      <th>Gear box type</th>\n",
       "      <th>Drive wheels</th>\n",
       "      <th>Doors</th>\n",
       "      <th>Wheel</th>\n",
       "      <th>Color</th>\n",
       "      <th>Airbags</th>\n",
       "      <th>LevyFillNA</th>\n",
       "      <th>LevyFillMedian</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>45798355</th>\n",
       "      <td>8467</td>\n",
       "      <td>None</td>\n",
       "      <td>MERCEDES-BENZ</td>\n",
       "      <td>CLK 200</td>\n",
       "      <td>1999</td>\n",
       "      <td>Coupe</td>\n",
       "      <td>1</td>\n",
       "      <td>CNG</td>\n",
       "      <td>2.0 Turbo</td>\n",
       "      <td>300000 km</td>\n",
       "      <td>4.0</td>\n",
       "      <td>Manual</td>\n",
       "      <td>Rear</td>\n",
       "      <td>02-Mar</td>\n",
       "      <td>Left wheel</td>\n",
       "      <td>Silver</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>642.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45778856</th>\n",
       "      <td>15681</td>\n",
       "      <td>831</td>\n",
       "      <td>HYUNDAI</td>\n",
       "      <td>Sonata</td>\n",
       "      <td>2011</td>\n",
       "      <td>Sedan</td>\n",
       "      <td>1</td>\n",
       "      <td>Petrol</td>\n",
       "      <td>2.4</td>\n",
       "      <td>161600 km</td>\n",
       "      <td>4.0</td>\n",
       "      <td>Tiptronic</td>\n",
       "      <td>Front</td>\n",
       "      <td>04-May</td>\n",
       "      <td>Left wheel</td>\n",
       "      <td>Red</td>\n",
       "      <td>8</td>\n",
       "      <td>831</td>\n",
       "      <td>831</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45804997</th>\n",
       "      <td>26108</td>\n",
       "      <td>836</td>\n",
       "      <td>HYUNDAI</td>\n",
       "      <td>Tucson</td>\n",
       "      <td>2010</td>\n",
       "      <td>Jeep</td>\n",
       "      <td>1</td>\n",
       "      <td>Diesel</td>\n",
       "      <td>2</td>\n",
       "      <td>116365 km</td>\n",
       "      <td>4.0</td>\n",
       "      <td>Automatic</td>\n",
       "      <td>Front</td>\n",
       "      <td>04-May</td>\n",
       "      <td>Left wheel</td>\n",
       "      <td>Grey</td>\n",
       "      <td>4</td>\n",
       "      <td>836</td>\n",
       "      <td>836</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45793526</th>\n",
       "      <td>5331</td>\n",
       "      <td>1288</td>\n",
       "      <td>CHEVROLET</td>\n",
       "      <td>Captiva</td>\n",
       "      <td>2007</td>\n",
       "      <td>Jeep</td>\n",
       "      <td>1</td>\n",
       "      <td>Diesel</td>\n",
       "      <td>2</td>\n",
       "      <td>51258 km</td>\n",
       "      <td>4.0</td>\n",
       "      <td>Automatic</td>\n",
       "      <td>Front</td>\n",
       "      <td>04-May</td>\n",
       "      <td>Left wheel</td>\n",
       "      <td>Black</td>\n",
       "      <td>4</td>\n",
       "      <td>1288</td>\n",
       "      <td>1288</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45813273</th>\n",
       "      <td>470</td>\n",
       "      <td>753</td>\n",
       "      <td>HYUNDAI</td>\n",
       "      <td>Sonata</td>\n",
       "      <td>2012</td>\n",
       "      <td>Sedan</td>\n",
       "      <td>1</td>\n",
       "      <td>Hybrid</td>\n",
       "      <td>2.4</td>\n",
       "      <td>186923 km</td>\n",
       "      <td>4.0</td>\n",
       "      <td>Automatic</td>\n",
       "      <td>Front</td>\n",
       "      <td>04-May</td>\n",
       "      <td>Left wheel</td>\n",
       "      <td>White</td>\n",
       "      <td>12</td>\n",
       "      <td>753</td>\n",
       "      <td>753</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          Price  Levy   Manufacturer    Model  Prod_year Category  \\\n",
       "ID                                                                  \n",
       "45798355   8467  None  MERCEDES-BENZ  CLK 200       1999    Coupe   \n",
       "45778856  15681   831        HYUNDAI   Sonata       2011    Sedan   \n",
       "45804997  26108   836        HYUNDAI   Tucson       2010     Jeep   \n",
       "45793526   5331  1288      CHEVROLET  Captiva       2007     Jeep   \n",
       "45813273    470   753        HYUNDAI   Sonata       2012    Sedan   \n",
       "\n",
       "          Leather_interior Fuel type Engine volume    Mileage  Cylinders  \\\n",
       "ID                                                                         \n",
       "45798355                 1       CNG     2.0 Turbo  300000 km        4.0   \n",
       "45778856                 1    Petrol           2.4  161600 km        4.0   \n",
       "45804997                 1    Diesel             2  116365 km        4.0   \n",
       "45793526                 1    Diesel             2   51258 km        4.0   \n",
       "45813273                 1    Hybrid           2.4  186923 km        4.0   \n",
       "\n",
       "         Gear box type Drive wheels   Doors       Wheel   Color  Airbags  \\\n",
       "ID                                                                         \n",
       "45798355        Manual         Rear  02-Mar  Left wheel  Silver        5   \n",
       "45778856     Tiptronic        Front  04-May  Left wheel     Red        8   \n",
       "45804997     Automatic        Front  04-May  Left wheel    Grey        4   \n",
       "45793526     Automatic        Front  04-May  Left wheel   Black        4   \n",
       "45813273     Automatic        Front  04-May  Left wheel   White       12   \n",
       "\n",
       "         LevyFillNA LevyFillMedian  \n",
       "ID                                  \n",
       "45798355          0          642.0  \n",
       "45778856        831            831  \n",
       "45804997        836            836  \n",
       "45793526       1288           1288  \n",
       "45813273        753            753  "
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Replace every missing value in the frame with 0 (a new DataFrame is returned)\n",
    "fillna_df = df.fillna(0)\n",
    "\n",
    "print(fillna_df.shape)\n",
    "\n",
    "print(fillna_df.isnull().any())\n",
    "\n",
    "# Replace missing levy values with 0\n",
    "df[\"LevyFillNA\"] = df[\"Levy\"].fillna(0)\n",
    "\n",
    "# Replace missing levy values with the median of the observed values.\n",
    "# The median must come from the original column: taking it from the\n",
    "# zero-filled \"LevyFillNA\" counts every gap as 0 and biases it downwards.\n",
    "levy_numeric = pd.to_numeric(df[\"Levy\"])\n",
    "df[\"LevyFillMedian\"] = levy_numeric.fillna(levy_numeric.median())\n",
    "\n",
    "df.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Price</th>\n",
       "      <th>Levy</th>\n",
       "      <th>Manufacturer</th>\n",
       "      <th>Model</th>\n",
       "      <th>Prod. year</th>\n",
       "      <th>Category</th>\n",
       "      <th>Leather_interior</th>\n",
       "      <th>Fuel type</th>\n",
       "      <th>Engine volume</th>\n",
       "      <th>Mileage</th>\n",
       "      <th>Cylinders</th>\n",
       "      <th>Gear box type</th>\n",
       "      <th>Drive wheels</th>\n",
       "      <th>Doors</th>\n",
       "      <th>Wheel</th>\n",
       "      <th>Color</th>\n",
       "      <th>Airbags</th>\n",
       "      <th>LevyFillNA</th>\n",
       "      <th>LevyFillMedian</th>\n",
       "      <th>LevyCopy</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>45798355</th>\n",
       "      <td>8467</td>\n",
       "      <td>None</td>\n",
       "      <td>MERCEDES-BENZ</td>\n",
       "      <td>CLK 200</td>\n",
       "      <td>1999</td>\n",
       "      <td>Coupe</td>\n",
       "      <td>1</td>\n",
       "      <td>CNG</td>\n",
       "      <td>2.0 Turbo</td>\n",
       "      <td>300000 km</td>\n",
       "      <td>4.0</td>\n",
       "      <td>Manual</td>\n",
       "      <td>Rear</td>\n",
       "      <td>02-Mar</td>\n",
       "      <td>Left wheel</td>\n",
       "      <td>Silver</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>642.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45778856</th>\n",
       "      <td>15681</td>\n",
       "      <td>831</td>\n",
       "      <td>HYUNDAI</td>\n",
       "      <td>Sonata</td>\n",
       "      <td>2011</td>\n",
       "      <td>Sedan</td>\n",
       "      <td>1</td>\n",
       "      <td>Petrol</td>\n",
       "      <td>2.4</td>\n",
       "      <td>161600 km</td>\n",
       "      <td>4.0</td>\n",
       "      <td>Tiptronic</td>\n",
       "      <td>Front</td>\n",
       "      <td>04-May</td>\n",
       "      <td>Left wheel</td>\n",
       "      <td>Red</td>\n",
       "      <td>8</td>\n",
       "      <td>831</td>\n",
       "      <td>831</td>\n",
       "      <td>831</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45804997</th>\n",
       "      <td>26108</td>\n",
       "      <td>836</td>\n",
       "      <td>HYUNDAI</td>\n",
       "      <td>Tucson</td>\n",
       "      <td>2010</td>\n",
       "      <td>Jeep</td>\n",
       "      <td>1</td>\n",
       "      <td>Diesel</td>\n",
       "      <td>2</td>\n",
       "      <td>116365 km</td>\n",
       "      <td>4.0</td>\n",
       "      <td>Automatic</td>\n",
       "      <td>Front</td>\n",
       "      <td>04-May</td>\n",
       "      <td>Left wheel</td>\n",
       "      <td>Grey</td>\n",
       "      <td>4</td>\n",
       "      <td>836</td>\n",
       "      <td>836</td>\n",
       "      <td>836</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45793526</th>\n",
       "      <td>5331</td>\n",
       "      <td>1288</td>\n",
       "      <td>CHEVROLET</td>\n",
       "      <td>Captiva</td>\n",
       "      <td>2007</td>\n",
       "      <td>Jeep</td>\n",
       "      <td>1</td>\n",
       "      <td>Diesel</td>\n",
       "      <td>2</td>\n",
       "      <td>51258 km</td>\n",
       "      <td>4.0</td>\n",
       "      <td>Automatic</td>\n",
       "      <td>Front</td>\n",
       "      <td>04-May</td>\n",
       "      <td>Left wheel</td>\n",
       "      <td>Black</td>\n",
       "      <td>4</td>\n",
       "      <td>1288</td>\n",
       "      <td>1288</td>\n",
       "      <td>1288</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45813273</th>\n",
       "      <td>470</td>\n",
       "      <td>753</td>\n",
       "      <td>HYUNDAI</td>\n",
       "      <td>Sonata</td>\n",
       "      <td>2012</td>\n",
       "      <td>Sedan</td>\n",
       "      <td>1</td>\n",
       "      <td>Hybrid</td>\n",
       "      <td>2.4</td>\n",
       "      <td>186923 km</td>\n",
       "      <td>4.0</td>\n",
       "      <td>Automatic</td>\n",
       "      <td>Front</td>\n",
       "      <td>04-May</td>\n",
       "      <td>Left wheel</td>\n",
       "      <td>White</td>\n",
       "      <td>12</td>\n",
       "      <td>753</td>\n",
       "      <td>753</td>\n",
       "      <td>753</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          Price  Levy   Manufacturer    Model  Prod. year Category  \\\n",
       "ID                                                                   \n",
       "45798355   8467  None  MERCEDES-BENZ  CLK 200        1999    Coupe   \n",
       "45778856  15681   831        HYUNDAI   Sonata        2011    Sedan   \n",
       "45804997  26108   836        HYUNDAI   Tucson        2010     Jeep   \n",
       "45793526   5331  1288      CHEVROLET  Captiva        2007     Jeep   \n",
       "45813273    470   753        HYUNDAI   Sonata        2012    Sedan   \n",
       "\n",
       "          Leather_interior Fuel type Engine volume    Mileage  Cylinders  \\\n",
       "ID                                                                         \n",
       "45798355                 1       CNG     2.0 Turbo  300000 km        4.0   \n",
       "45778856                 1    Petrol           2.4  161600 km        4.0   \n",
       "45804997                 1    Diesel             2  116365 km        4.0   \n",
       "45793526                 1    Diesel             2   51258 km        4.0   \n",
       "45813273                 1    Hybrid           2.4  186923 km        4.0   \n",
       "\n",
       "         Gear box type Drive wheels   Doors       Wheel   Color  Airbags  \\\n",
       "ID                                                                         \n",
       "45798355        Manual         Rear  02-Mar  Left wheel  Silver        5   \n",
       "45778856     Tiptronic        Front  04-May  Left wheel     Red        8   \n",
       "45804997     Automatic        Front  04-May  Left wheel    Grey        4   \n",
       "45793526     Automatic        Front  04-May  Left wheel   Black        4   \n",
       "45813273     Automatic        Front  04-May  Left wheel   White       12   \n",
       "\n",
       "         LevyFillNA LevyFillMedian LevyCopy  \n",
       "ID                                           \n",
       "45798355          0          642.0        0  \n",
       "45778856        831            831      831  \n",
       "45804997        836            836      836  \n",
       "45793526       1288           1288     1288  \n",
       "45813273        753            753      753  "
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Work on a separate column so the original \"Levy\" values stay untouched\n",
    "df[\"LevyCopy\"] = df[\"Levy\"]\n",
    "\n",
    "# Fill the gaps directly in the existing DataFrame (nothing is returned,\n",
    "# the frame itself is modified)\n",
    "df.fillna(value={\"LevyCopy\": 0}, inplace=True)\n",
    "\n",
    "df.tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Удаление наблюдений с пропусками"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(13418, 20)\n",
      "Price               False\n",
      "Levy                False\n",
      "Manufacturer        False\n",
      "Model               False\n",
      "Prod. year          False\n",
      "Category            False\n",
      "Leather_interior    False\n",
      "Fuel type           False\n",
      "Engine volume       False\n",
      "Mileage             False\n",
      "Cylinders           False\n",
      "Gear box type       False\n",
      "Drive wheels        False\n",
      "Doors               False\n",
      "Wheel               False\n",
      "Color               False\n",
      "Airbags             False\n",
      "dtype: bool\n"
     ]
    }
   ],
   "source": [
    "# Drop every observation (row) that contains at least one missing value\n",
    "dropna_df = df.dropna()\n",
    "\n",
    "print(dropna_df.shape)\n",
    "\n",
    "# Check the result of dropna itself (not fillna_df from the earlier cell):\n",
    "# no feature of dropna_df may still contain missing values\n",
    "print(dropna_df.isnull().any())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Создание выборок данных\n",
    "\n",
    "Библиотека scikit-learn\n",
    "\n",
    "https://scikit-learn.org/stable/index.html"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img src=\"assets/lec2-split.png\" width=\"600\" style=\"background-color: white\">"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Функция для создания выборок\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "\n",
    "def split_stratified_into_train_val_test(\n",
    "    df_input,\n",
    "    stratify_colname=\"y\",\n",
    "    frac_train=0.6,\n",
    "    frac_val=0.15,\n",
    "    frac_test=0.25,\n",
    "    random_state=None,\n",
    "):\n",
    "    \"\"\"\n",
    "    Splits a Pandas dataframe into three subsets (train, val, and test)\n",
    "    following fractional ratios provided by the user, where each subset is\n",
    "    stratified by the values in a specific column (that is, each subset has\n",
    "    the same relative frequency of the values in the column). It performs this\n",
    "    splitting by running train_test_split() twice.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df_input : Pandas dataframe\n",
    "        Input dataframe to be split.\n",
    "    stratify_colname : str\n",
    "        The name of the column that will be used for stratification. Usually\n",
    "        this column would be for the label.\n",
    "    frac_train : float\n",
    "    frac_val   : float\n",
    "    frac_test  : float\n",
    "        The ratios with which the dataframe will be split into train, val, and\n",
    "        test data. The values should be expressed as float fractions and should\n",
    "        sum to 1.0 (checked with a floating-point tolerance).\n",
    "    random_state : int, None, or RandomStateInstance\n",
    "        Value to be passed to both train_test_split() calls.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    df_train, df_val, df_test :\n",
    "        Dataframes containing the three splits.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If the fractions do not add up to 1.0, or if stratify_colname is not\n",
    "        a column of df_input.\n",
    "    \"\"\"\n",
    "    import math\n",
    "\n",
    "    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 are not exactly\n",
    "    # 1.0 in binary floating point and would be rejected by a strict `!=`.\n",
    "    if not math.isclose(frac_train + frac_val + frac_test, 1.0):\n",
    "        raise ValueError(\n",
    "            \"fractions %f, %f, %f do not add up to 1.0\"\n",
    "            % (frac_train, frac_val, frac_test)\n",
    "        )\n",
    "\n",
    "    if stratify_colname not in df_input.columns:\n",
    "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
    "\n",
    "    X = df_input  # Contains all columns.\n",
    "    y = df_input[\n",
    "        [stratify_colname]\n",
    "    ]  # Dataframe of just the column on which to stratify.\n",
    "\n",
    "    # Split original dataframe into train and temp dataframes.\n",
    "    df_train, df_temp, y_train, y_temp = train_test_split(\n",
    "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
    "    )\n",
    "\n",
    "    # Split the temp dataframe into val and test dataframes.\n",
    "    # test_size is relative to the temp part, hence the rescaled fraction.\n",
    "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
    "    df_val, df_test, y_val, y_test = train_test_split(\n",
    "        df_temp,\n",
    "        y_temp,\n",
    "        stratify=y_temp,\n",
    "        test_size=relative_frac_test,\n",
    "        random_state=random_state,\n",
    "    )\n",
    "\n",
    "    # Sanity check: every input row ended up in exactly one of the splits.\n",
    "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
    "\n",
    "    return df_train, df_val, df_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Leather_interior\n",
      "1    13954\n",
      "0     5283\n",
      "Name: count, dtype: int64\n",
      "Обучающая выборка:  (11542, 3)\n",
      "Leather_interior\n",
      "1    8372\n",
      "0    3170\n",
      "Name: count, dtype: int64\n",
      "Контрольная выборка:  (3847, 3)\n",
      "Leather_interior\n",
      "1    2791\n",
      "0    1056\n",
      "Name: count, dtype: int64\n",
      "Тестовая выборка:  (3848, 3)\n",
      "Leather_interior\n",
      "1    2791\n",
      "0    1057\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Class distribution (number of observations per label) in the full dataset\n",
    "print(df.Leather_interior.value_counts())\n",
    "\n",
    "data = df[[\"Leather_interior\", \"Price\", \"Prod_year\"]].copy()\n",
    "\n",
    "# A fixed seed makes the split, and everything derived from it, reproducible\n",
    "# across kernel restarts.\n",
    "df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
    "    data,\n",
    "    stratify_colname=\"Leather_interior\",\n",
    "    frac_train=0.60,\n",
    "    frac_val=0.20,\n",
    "    frac_test=0.20,\n",
    "    random_state=42,\n",
    ")\n",
    "\n",
    "# Size and class distribution of each subset\n",
    "for title, subset in (\n",
    "    (\"Обучающая выборка: \", df_train),\n",
    "    (\"Контрольная выборка: \", df_val),\n",
    "    (\"Тестовая выборка: \", df_test),\n",
    "):\n",
    "    print(title, subset.shape)\n",
    "    print(subset.Leather_interior.value_counts())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Выборка с избытком (oversampling)\n",
    "\n",
    "https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n",
    "\n",
    "https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n",
    "\n",
    "Выборка с недостатком (undersampling)\n",
    "\n",
    "https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n",
    "\n",
    "Библиотека imbalanced-learn\n",
    "\n",
    "https://imbalanced-learn.org/stable/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Обучающая выборка:  (11542, 3)\n",
      "Leather_interior\n",
      "1    8372\n",
      "0    3170\n",
      "Name: count, dtype: int64\n",
      "Обучающая выборка после oversampling:  (16416, 3)\n",
      "Leather_interior\n",
      "1    8372\n",
      "0    8044\n",
      "Name: count, dtype: int64\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Leather_interior</th>\n",
       "      <th>Price</th>\n",
       "      <th>Prod_year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>21400</td>\n",
       "      <td>2008</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>16621</td>\n",
       "      <td>2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>28852</td>\n",
       "      <td>2017</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>2430</td>\n",
       "      <td>2008</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>7840</td>\n",
       "      <td>2005</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16411</th>\n",
       "      <td>0</td>\n",
       "      <td>26030</td>\n",
       "      <td>2013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16412</th>\n",
       "      <td>0</td>\n",
       "      <td>26030</td>\n",
       "      <td>2012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16413</th>\n",
       "      <td>0</td>\n",
       "      <td>26030</td>\n",
       "      <td>2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16414</th>\n",
       "      <td>0</td>\n",
       "      <td>26030</td>\n",
       "      <td>2012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16415</th>\n",
       "      <td>0</td>\n",
       "      <td>26030</td>\n",
       "      <td>2012</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>16416 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       Leather_interior  Price  Prod_year\n",
       "0                     0  21400       2008\n",
       "1                     0  16621       2016\n",
       "2                     1  28852       2017\n",
       "3                     1   2430       2008\n",
       "4                     0   7840       2005\n",
       "...                 ...    ...        ...\n",
       "16411                 0  26030       2013\n",
       "16412                 0  26030       2012\n",
       "16413                 0  26030       2014\n",
       "16414                 0  26030       2012\n",
       "16415                 0  26030       2012\n",
       "\n",
       "[16416 rows x 3 columns]"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from imblearn.over_sampling import ADASYN\n",
    "\n",
    "# Fixed seed so the generated synthetic samples are reproducible\n",
    "ada = ADASYN(random_state=42)\n",
    "\n",
    "print(\"Обучающая выборка: \", df_train.shape)\n",
    "print(df_train.Leather_interior.value_counts())\n",
    "\n",
    "# Resample the features only: the label must not be part of X, otherwise it\n",
    "# takes part in the nearest-neighbour search and is interpolated like a feature.\n",
    "X_train = df_train.drop(columns=[\"Leather_interior\"])\n",
    "y_train = df_train[\"Leather_interior\"]\n",
    "X_resampled, y_resampled = ada.fit_resample(X_train, y_train)\n",
    "\n",
    "# Re-attach the resampled labels and restore the original column order\n",
    "df_train_adasyn = pd.concat([X_resampled, y_resampled], axis=1)[df_train.columns]\n",
    "\n",
    "print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
    "print(df_train_adasyn.Leather_interior.value_counts())\n",
    "\n",
    "df_train_adasyn"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}