AIM-PIbd-31-LOBASHOV-I-D/lab_2/lab_2.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Lab2 PIbd-31 Lobashov\n",
    "Три датасета:\n",
    "1. Цена на автомобили (17 вариант)\n",
    "2. Магазины (9 вариант)\n",
    "3. Цены на золото (14 вариант)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 19237 entries, 0 to 19236\n",
      "Data columns (total 18 columns):\n",
      " #   Column            Non-Null Count  Dtype  \n",
      "---  ------            --------------  -----  \n",
      " 0   ID                19237 non-null  int64  \n",
      " 1   Price             19237 non-null  int64  \n",
      " 2   Levy              19237 non-null  int64  \n",
      " 3   Manufacturer      19237 non-null  object \n",
      " 4   Model             19237 non-null  object \n",
      " 5   Prod. year        19237 non-null  int64  \n",
      " 6   Category          19237 non-null  object \n",
      " 7   Leather interior  19237 non-null  object \n",
      " 8   Fuel type         19237 non-null  object \n",
      " 9   Engine volume     19237 non-null  object \n",
      " 10  Mileage           19237 non-null  int64  \n",
      " 11  Cylinders         19237 non-null  float64\n",
      " 12  Gear box type     19237 non-null  object \n",
      " 13  Drive wheels      19237 non-null  object \n",
      " 14  Doors             19237 non-null  object \n",
      " 15  Wheel             19237 non-null  object \n",
      " 16  Color             19237 non-null  object \n",
      " 17  Airbags           19237 non-null  int64  \n",
      "dtypes: float64(1), int64(6), object(11)\n",
      "memory usage: 2.6+ MB\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 896 entries, 0 to 895\n",
      "Data columns (total 5 columns):\n",
      " #   Column                Non-Null Count  Dtype\n",
      "---  ------                --------------  -----\n",
      " 0   Store ID              896 non-null    int64\n",
      " 1   Store_Area            896 non-null    int64\n",
      " 2   Items_Available       896 non-null    int64\n",
      " 3   Daily_Customer_Count  896 non-null    int64\n",
      " 4   Store_Sales           896 non-null    int64\n",
      "dtypes: int64(5)\n",
      "memory usage: 35.1 KB\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1718 entries, 0 to 1717\n",
      "Data columns (total 81 columns):\n",
      " #   Column         Non-Null Count  Dtype  \n",
      "---  ------         --------------  -----  \n",
      " 0   Date           1718 non-null   object \n",
      " 1   Open           1718 non-null   float64\n",
      " 2   High           1718 non-null   float64\n",
      " 3   Low            1718 non-null   float64\n",
      " 4   Close          1718 non-null   float64\n",
      " 5   Adj Close      1718 non-null   float64\n",
      " 6   Volume         1718 non-null   int64  \n",
      " 7   SP_open        1718 non-null   float64\n",
      " 8   SP_high        1718 non-null   float64\n",
      " 9   SP_low         1718 non-null   float64\n",
      " 10  SP_close       1718 non-null   float64\n",
      " 11  SP_Ajclose     1718 non-null   float64\n",
      " 12  SP_volume      1718 non-null   int64  \n",
      " 13  DJ_open        1718 non-null   float64\n",
      " 14  DJ_high        1718 non-null   float64\n",
      " 15  DJ_low         1718 non-null   float64\n",
      " 16  DJ_close       1718 non-null   float64\n",
      " 17  DJ_Ajclose     1718 non-null   float64\n",
      " 18  DJ_volume      1718 non-null   int64  \n",
      " 19  EG_open        1718 non-null   float64\n",
      " 20  EG_high        1718 non-null   float64\n",
      " 21  EG_low         1718 non-null   float64\n",
      " 22  EG_close       1718 non-null   float64\n",
      " 23  EG_Ajclose     1718 non-null   float64\n",
      " 24  EG_volume      1718 non-null   int64  \n",
      " 25  EU_Price       1718 non-null   float64\n",
      " 26  EU_open        1718 non-null   float64\n",
      " 27  EU_high        1718 non-null   float64\n",
      " 28  EU_low         1718 non-null   float64\n",
      " 29  EU_Trend       1718 non-null   int64  \n",
      " 30  OF_Price       1718 non-null   float64\n",
      " 31  OF_Open        1718 non-null   float64\n",
      " 32  OF_High        1718 non-null   float64\n",
      " 33  OF_Low         1718 non-null   float64\n",
      " 34  OF_Volume      1718 non-null   int64  \n",
      " 35  OF_Trend       1718 non-null   int64  \n",
      " 36  OS_Price       1718 non-null   float64\n",
      " 37  OS_Open        1718 non-null   float64\n",
      " 38  OS_High        1718 non-null   float64\n",
      " 39  OS_Low         1718 non-null   float64\n",
      " 40  OS_Trend       1718 non-null   int64  \n",
      " 41  SF_Price       1718 non-null   int64  \n",
      " 42  SF_Open        1718 non-null   int64  \n",
      " 43  SF_High        1718 non-null   int64  \n",
      " 44  SF_Low         1718 non-null   int64  \n",
      " 45  SF_Volume      1718 non-null   int64  \n",
      " 46  SF_Trend       1718 non-null   int64  \n",
      " 47  USB_Price      1718 non-null   float64\n",
      " 48  USB_Open       1718 non-null   float64\n",
      " 49  USB_High       1718 non-null   float64\n",
      " 50  USB_Low        1718 non-null   float64\n",
      " 51  USB_Trend      1718 non-null   int64  \n",
      " 52  PLT_Price      1718 non-null   float64\n",
      " 53  PLT_Open       1718 non-null   float64\n",
      " 54  PLT_High       1718 non-null   float64\n",
      " 55  PLT_Low        1718 non-null   float64\n",
      " 56  PLT_Trend      1718 non-null   int64  \n",
      " 57  PLD_Price      1718 non-null   float64\n",
      " 58  PLD_Open       1718 non-null   float64\n",
      " 59  PLD_High       1718 non-null   float64\n",
      " 60  PLD_Low        1718 non-null   float64\n",
      " 61  PLD_Trend      1718 non-null   int64  \n",
      " 62  RHO_PRICE      1718 non-null   int64  \n",
      " 63  USDI_Price     1718 non-null   float64\n",
      " 64  USDI_Open      1718 non-null   float64\n",
      " 65  USDI_High      1718 non-null   float64\n",
      " 66  USDI_Low       1718 non-null   float64\n",
      " 67  USDI_Volume    1718 non-null   int64  \n",
      " 68  USDI_Trend     1718 non-null   int64  \n",
      " 69  GDX_Open       1718 non-null   float64\n",
      " 70  GDX_High       1718 non-null   float64\n",
      " 71  GDX_Low        1718 non-null   float64\n",
      " 72  GDX_Close      1718 non-null   float64\n",
      " 73  GDX_Adj Close  1718 non-null   float64\n",
      " 74  GDX_Volume     1718 non-null   int64  \n",
      " 75  USO_Open       1718 non-null   float64\n",
      " 76  USO_High       1718 non-null   float64\n",
      " 77  USO_Low        1718 non-null   float64\n",
      " 78  USO_Close      1718 non-null   float64\n",
      " 79  USO_Adj Close  1718 non-null   float64\n",
      " 80  USO_Volume     1718 non-null   int64  \n",
      "dtypes: float64(58), int64(22), object(1)\n",
      "memory usage: 1.1+ MB\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Forward slashes are accepted on every OS; the previous backslash-separated\n",
    "# paths only resolved on Windows.\n",
    "DATA_DIR = \"../static/csv\"\n",
    "\n",
    "df = pd.read_csv(f\"{DATA_DIR}/car_price_prediction.csv\")  # dataset 1: car prices\n",
    "df2 = pd.read_csv(f\"{DATA_DIR}/Stores.csv\")  # dataset 2: stores\n",
    "df3 = pd.read_csv(f\"{DATA_DIR}/FINAL_USO.csv\")  # dataset 3: gold prices\n",
    "\n",
    "# Print column names, dtypes and non-null counts of each frame.\n",
    "df.info()\n",
    "df2.info()\n",
    "df3.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Проблемная область\n",
    "Первый датасет позволяет проанализировать данные и понять, какие автомобили имеют превосходство на рынке и какие чаще всего выбирают пользователи.\n",
    "Второй датасет позволяет при помощи данных о магазинах проанализировать их производительность и выявить факторы, влияющие на продажи.\n",
    "Третий датасет позволяет проанализировать данные и спрогнозировать цены на золото на основе различных финансовых показателей.\n",
    "#### Анализ набора данных\n",
    "Объекты наблюдения - автомобили, магазины, цены на золото\n",
    "Атрибуты - \n",
    "1. ID, Цена, Налог, Производитель, Модель, Год производства, Категория, Кожаный салон, Тип топлива, Объем двигателя, Пробег, Цилиндры, Тип коробки передач, Приводные колеса, Количество дверей, Руль, Цвет, Подушки безопасности.\n",
    "2. ID магазина, Площадь магазина, Доступные товары, Ежедневное количество покупателей, Продажи магазина.\n",
    "3. Дата, Открытие, Максимум, Минимум, Закрытие, Скорректированное закрытие, Объем торгов, SP_открытие, SP_максимум, SP_минимум, SP_закрытие, SP_скорректированное закрытие, SP_объем, DJ_открытие, DJ_максимум, DJ_минимум, DJ_закрытие, DJ_скорректированное закрытие, DJ_объем, EG_открытие, EG_максимум, EG_минимум, EG_закрытие, EG_скорректированное закрытие, EG_объем, EU_Цена, EU_открытие, EU_максимум, EU_минимум, EU_тренд, OF_Цена, OF_Открытие, OF_Максимум, OF_Минимум, OF_Объем, OF_Тренд, OS_Цена, OS_Открытие, OS_Максимум, OS_Минимум, OS_Тренд, SF_Цена, SF_Открытие, SF_Максимум, SF_Минимум, SF_Объем, SF_Тренд, USB_Цена, USB_Открытие, USB_Максимум, USB_Минимум, USB_Тренд, PLT_Цена, PLT_Открытие, PLT_Максимум, PLT_Минимум, PLT_Тренд, PLD_Цена, PLD_Открытие, PLD_Максимум, PLD_Минимум, PLD_Тренд, RHO_Цена, USDI_Цена, USDI_Открытие, USDI_Максимум, USDI_Минимум, USDI_Объем, USDI_Тренд, GDX_Открытие, GDX_Максимум, GDX_Минимум, GDX_Закрытие, GDX_Скорректированное закрытие, GDX_Объем, USO_Открытие, USO_Максимум, USO_Минимум, USO_Закрытие, USO_Скорректированное закрытие, USO_Объем.\n",
    "Связи между объектами - нет\n",
    "#### Бизнес-цели\n",
    "1. Какие модели автомобилей более выгодно продавать магазинам техники. Какие комплектующие наиболее популярны у производителей и покупателей, для дальнейшего увеличения производства данных комплектующих.\n",
    "2. Анализ производительности магазинов для выявления факторов, влияющих на продажи, и оптимизация работы магазинов.\n",
    "3. Прогноз цен на золото для принятия инвестиционных решений и управления рисками.\n",
    "#### Примеры целей технического проекта. Что поступает на вход, что является целевым признаком.\n",
    "На входе всегда датасет, целевые признаки:\n",
    "1. Цена автомобиля (Price)\n",
    "2. Продажи магазина (Store_Sales)\n",
    "3. Цена закрытия золота (Close)\n",
    "#### Проблемы набора данных и их решения\n",
    "1. Возможны устаревшие данные. Для решения данной проблемы требуется удаление самых старых записей и добавление более новых.\n",
    "2. Возможны выбросы. Решить эту проблему помогает удаление таких выбросов. В маленькой выборке не рекомендуется удалять выбросы. В большой выборке выбросы усредняются.\n",
    "#### Качество набора данных\n",
    "Наборы данных содержат достаточно примеров и признаков для обучения модели. Учтены различные ситуации проблемной области. Данные соответствуют данным, которые будут подаваться в производственной среде. Все метки согласованы.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Поиск аномалий"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                 ID         Price          Levy    Prod. year       Mileage  \\\n",
      "count  1.923700e+04  1.923700e+04  19237.000000  19237.000000  1.923700e+04   \n",
      "mean   4.557654e+07  1.855593e+04    935.018662   2010.912824  1.532236e+06   \n",
      "std    9.365914e+05  1.905813e+05    388.099990      5.668673  4.840387e+07   \n",
      "min    2.074688e+07  1.000000e+00     87.000000   1939.000000  0.000000e+00   \n",
      "25%    4.569837e+07  5.331000e+03    730.000000   2009.000000  7.013900e+04   \n",
      "50%    4.577231e+07  1.317200e+04   1000.000000   2012.000000  1.260000e+05   \n",
      "75%    4.580204e+07  2.207500e+04   1000.000000   2015.000000  1.888880e+05   \n",
      "max    4.581665e+07  2.630750e+07  11714.000000   2020.000000  2.147484e+09   \n",
      "\n",
      "          Cylinders       Airbags  \n",
      "count  19237.000000  19237.000000  \n",
      "mean       4.582991      6.582627  \n",
      "std        1.199933      4.320168  \n",
      "min        1.000000      0.000000  \n",
      "25%        4.000000      4.000000  \n",
      "50%        4.000000      6.000000  \n",
      "75%        4.000000     12.000000  \n",
      "max       16.000000     16.000000  \n",
      "        Store ID    Store_Area  Items_Available  Daily_Customer_Count  \\\n",
      "count  896.000000   896.000000       896.000000            896.000000   \n",
      "mean   448.500000  1485.409598      1782.035714            786.350446   \n",
      "std    258.797218   250.237011       299.872053            265.389281   \n",
      "min      1.000000   775.000000       932.000000             10.000000   \n",
      "25%    224.750000  1316.750000      1575.500000            600.000000   \n",
      "50%    448.500000  1477.000000      1773.500000            780.000000   \n",
      "75%    672.250000  1653.500000      1982.750000            970.000000   \n",
      "max    896.000000  2229.000000      2667.000000           1560.000000   \n",
      "\n",
      "         Store_Sales  \n",
      "count     896.000000  \n",
      "mean    59351.305804  \n",
      "std     17190.741895  \n",
      "min     14920.000000  \n",
      "25%     46530.000000  \n",
      "50%     58605.000000  \n",
      "75%     71872.500000  \n",
      "max    116320.000000  \n",
      "              Open         High          Low        Close    Adj Close  \\\n",
      "count  1718.000000  1718.000000  1718.000000  1718.000000  1718.000000   \n",
      "mean    127.323434   127.854237   126.777695   127.319482   127.319482   \n",
      "std      17.526993    17.631189    17.396513    17.536269    17.536269   \n",
      "min     100.919998   100.989998   100.230003   100.500000   100.500000   \n",
      "25%     116.220001   116.540001   115.739998   116.052502   116.052502   \n",
      "50%     121.915001   122.325001   121.369999   121.795002   121.795002   \n",
      "75%     128.427494   129.087498   127.840001   128.470001   128.470001   \n",
      "max     173.199997   174.070007   172.919998   173.610001   173.610001   \n",
      "\n",
      "             Volume      SP_open      SP_high       SP_low     SP_close  ...  \\\n",
      "count  1.718000e+03  1718.000000  1718.000000  1718.000000  1718.000000  ...   \n",
      "mean   8.446327e+06   204.490023   205.372637   203.487014   204.491222  ...   \n",
      "std    4.920731e+06    43.831928    43.974644    43.618940    43.776999  ...   \n",
      "min    1.501600e+06   122.059998   122.320000   120.029999   120.290001  ...   \n",
      "25%    5.412925e+06   170.392498   170.962506   169.577499   170.397500  ...   \n",
      "50%    7.483900e+06   205.464996   206.459999   204.430000   205.529999  ...   \n",
      "75%    1.020795e+07   237.292500   237.722500   236.147503   236.889996  ...   \n",
      "max    9.380420e+07   293.089996   293.940002   291.809998   293.579987  ...   \n",
      "\n",
      "           GDX_Low    GDX_Close  GDX_Adj Close    GDX_Volume     USO_Open  \\\n",
      "count  1718.000000  1718.000000    1718.000000  1.718000e+03  1718.000000   \n",
      "mean     26.384575    26.715012      25.924624  4.356515e+07    22.113417   \n",
      "std      10.490908    10.603110       9.886570  2.909151e+07    11.431056   \n",
      "min      12.400000    12.470000      12.269618  4.729000e+06     7.820000   \n",
      "25%      20.355000    20.585000      20.180950  2.259968e+07    11.420000   \n",
      "50%      22.870001    23.054999      22.677604  3.730465e+07    16.450000   \n",
      "75%      26.797500    27.317500      26.478154  5.697055e+07    34.419998   \n",
      "max      56.770000    57.470001      54.617039  2.321536e+08    41.599998   \n",
      "\n",
      "          USO_High      USO_Low    USO_Close  USO_Adj Close    USO_Volume  \n",
      "count  1718.000000  1718.000000  1718.000000    1718.000000  1.718000e+03  \n",
      "mean     22.307148    21.904657    22.109051      22.109051  1.922313e+07  \n",
      "std      11.478671    11.373997    11.432787      11.432787  1.575743e+07  \n",
      "min       8.030000     7.670000     7.960000       7.960000  1.035100e+06  \n",
      "25%      11.500000    11.300000    11.392500      11.392500  6.229500e+06  \n",
      "50%      16.635001    16.040000    16.345000      16.345000  1.613015e+07  \n",
      "75%      34.667499    34.110000    34.417499      34.417499  2.672375e+07  \n",
      "max      42.299999    41.299999    42.009998      42.009998  1.102657e+08  \n",
      "\n",
      "[8 rows x 80 columns]\n"
     ]
    }
   ],
   "source": [
    "# Summary statistics of the numeric columns, one dataset after another.\n",
    "for frame in (df, df2, df3):\n",
    "    print(frame.describe())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
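    "При просмотре вывода в датасетах 2 и 3 аномалий не замечено. В первом датасете есть подозрительные значения, которые стоит проверить: максимальный пробег (Mileage) равен 2.147484e+09, минимальная цена (Price) равна 1 при максимальной 2.630750e+07, минимальный год выпуска (Prod. year) — 1939."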
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Проблема пропущенных данных"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DATASET 1\n",
      "DATASET 2\n",
      "DATASET 3\n"
     ]
    }
   ],
   "source": [
    "def print_null_rates(frame):\n",
    "    \"\"\"Print the percentage of missing values for each column that has any.\"\"\"\n",
    "    null_rates = frame.isnull().sum() / len(frame) * 100\n",
    "    for column, null_rate in null_rates.items():\n",
    "        if null_rate > 0:\n",
    "            print(f\"{column} процент пустых значений: %{null_rate:.2f}\")\n",
    "\n",
    "\n",
    "# Same report for every dataset, each preceded by its header line.\n",
    "for title, frame in ((\"DATASET 1\", df), (\"DATASET 2\", df2), (\"DATASET 3\", df3)):\n",
    "    print(title)\n",
    "    print_null_rates(frame)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Во всех датасетах пустых значений не найдено."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Разбиение на выборки\n",
    "Для разбиения на выборки сначала уменьшим количество уникальных значений целевых признаков: добавим новые столбцы, в которых целевой признак разбит на небольшое число категорий."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\goldfest\\AppData\\Local\\Temp\\ipykernel_12052\\2802807477.py:9: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df3_filtered['new_price'] = pd.cut(df3_filtered['Open'], bins=[100, 120, 130, 140, 160, 180], labels=[1, 2, 3, 4, 5], include_lowest=True)\n"
     ]
    }
   ],
   "source": [
    "# Bucket the car price into five ordinal classes (1..5) used later for stratification.\n",
    "df['new_rating'] = pd.cut(df['Price'], bins=[0, 40000, 80000, 120000, 180000, 50000000], labels=[1, 2, 3, 4, 5], include_lowest=True)\n",
    "\n",
    "# Bucket the store sales into five ordinal classes (1..5).\n",
    "df2['new_high'] = pd.cut(df2['Store_Sales'], bins=[0, 25000, 50000, 75000, 100000, 127000], labels=[1, 2, 3, 4, 5], include_lowest=True)\n",
    "\n",
    "# Keep the gold rows with Open in [100, 160] and bucket them into price classes.\n",
    "# .copy() detaches the filtered frame from df3, so adding a column to it no longer\n",
    "# triggers SettingWithCopyWarning.\n",
    "# NOTE(review): after the <= 160 filter the last bin (160, 180] can never be\n",
    "# populated, so class 5 stays empty - confirm whether the filter or the bins are intended.\n",
    "df3_filtered = df3[(df3['Open'] >= 100) & (df3['Open'] <= 160)].copy()\n",
    "df3_filtered['new_price'] = pd.cut(df3_filtered['Open'], bins=[100, 120, 130, 140, 160, 180], labels=[1, 2, 3, 4, 5], include_lowest=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "def split_stratified_into_train_val_test(\n",
    "    df_input,\n",
    "    stratify_colname=\"y\",\n",
    "    frac_train=0.6,\n",
    "    frac_val=0.15,\n",
    "    frac_test=0.25,\n",
    "    random_state=None,\n",
    "):\n",
    "    if frac_train + frac_val + frac_test != 1.0:\n",
    "        raise ValueError(\n",
    "            \"fractions %f, %f, %f do not add up to 1.0\"\n",
    "            % (frac_train, frac_val, frac_test)\n",
    "        )\n",
    "\n",
    "    if stratify_colname not in df_input.columns:\n",
    "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
    "\n",
    "    X = df_input  \n",
    "    y = df_input[\n",
    "        [stratify_colname]\n",
    "    ]  \n",
    "\n",
    "    df_train, df_temp, y_train, y_temp = train_test_split(\n",
    "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
    "    )\n",
    "\n",
    "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
    "    df_val, df_test, y_val, y_test = train_test_split(\n",
    "        df_temp,\n",
    "        y_temp,\n",
    "        stratify=y_temp,\n",
    "        test_size=relative_frac_test,\n",
    "        random_state=random_state,\n",
    "    )\n",
    "\n",
    "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
    "\n",
    "    return df_train, df_val, df_test"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Выборки датасетов"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DATASET 1\n",
      "Train: (11542, 19), Val: (3847, 19), Test: (3848, 19)\n",
      "DATASET 2\n",
      "Train: (537, 6), Val: (179, 6), Test: (180, 6)\n",
      "DATASET 3\n",
      "Train: (929, 82), Val: (310, 82), Test: (310, 82)\n"
     ]
    }
   ],
   "source": [
    "# Split every dataset 60/20/20, stratified on its binned target.\n",
    "# A fixed random_state keeps the subsets identical across kernel restarts.\n",
    "df_train1, df_val1, df_test1 = split_stratified_into_train_val_test(\n",
    "    df, stratify_colname=\"new_rating\", frac_train=0.60, frac_val=0.20, frac_test=0.20, random_state=42\n",
    ")\n",
    "\n",
    "df_train2, df_val2, df_test2 = split_stratified_into_train_val_test(\n",
    "    df2, stratify_colname=\"new_high\", frac_train=0.60, frac_val=0.20, frac_test=0.20, random_state=42\n",
    ")\n",
    "\n",
    "df_train3, df_val3, df_test3 = split_stratified_into_train_val_test(\n",
    "    df3_filtered, stratify_colname=\"new_price\", frac_train=0.60, frac_val=0.20, frac_test=0.20, random_state=42\n",
    ")\n",
    "\n",
    "# Report the shape of each subset.\n",
    "print(\"DATASET 1\")\n",
    "print(f\"Train: {df_train1.shape}, Val: {df_val1.shape}, Test: {df_test1.shape}\")\n",
    "\n",
    "print(\"DATASET 2\")\n",
    "print(f\"Train: {df_train2.shape}, Val: {df_val2.shape}, Test: {df_test2.shape}\")\n",
    "\n",
    "print(\"DATASET 3\")\n",
    "print(f\"Train: {df_train3.shape}, Val: {df_val3.shape}, Test: {df_test3.shape}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Было сделано разбиение на три выборки: 60%, 20% и 20% при помощи библиотеки scikit-learn и функции train_test_split. На первый взгляд выборки сбалансированы по размеру.\n",
    "### Приращение методами выборки с избытком (oversampling) и выборки с недостатком (undersampling)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Выборка до oversampling и undersampling (датасет 1): (11542, 6)\n",
      "new_rating\n",
      "1    10485\n",
      "2      919\n",
      "3      102\n",
      "4       27\n",
      "5        9\n",
      "Name: count, dtype: int64\n",
      "Выборка после oversampling (датасет 1):  (52450, 6)\n",
      "new_rating\n",
      "2    10509\n",
      "3    10490\n",
      "1    10485\n",
      "5    10484\n",
      "4    10482\n",
      "Name: count, dtype: int64\n",
      "Выборка после undersampling (датасет 1):  (45, 6)\n",
      "new_rating\n",
      "1    9\n",
      "2    9\n",
      "3    9\n",
      "4    9\n",
      "5    9\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "from imblearn.over_sampling import ADASYN\n",
    "from imblearn.under_sampling import RandomUnderSampler\n",
    "\n",
    "# Keep the numeric features and the class label; .copy() detaches the selection.\n",
    "# 'Mileage' used to be listed twice, which duplicated the column and made the\n",
    "# selection grow every time this cell was re-run.\n",
    "df_train1 = df_train1[['Price', 'Levy', 'Mileage', 'Prod. year', 'new_rating']].copy()\n",
    "\n",
    "# ADASYN is seeded like the undersampler so the oversampled set is reproducible.\n",
    "ada = ADASYN(random_state=42)\n",
    "undersampler = RandomUnderSampler(random_state=42)\n",
    "\n",
    "print(\"Выборка до oversampling и undersampling (датасет 1):\", df_train1.shape)\n",
    "print(df_train1['new_rating'].value_counts())\n",
    "\n",
    "# Oversampling: synthesize rows for the under-represented price classes.\n",
    "X_resampled, y_resampled = ada.fit_resample(df_train1, df_train1['new_rating'])\n",
    "df_train1_adasyn = pd.DataFrame(X_resampled)\n",
    "\n",
    "print(\"Выборка после oversampling (датасет 1): \", df_train1_adasyn.shape)\n",
    "print(df_train1_adasyn.new_rating.value_counts())\n",
    "\n",
    "# Undersampling: cut every class down to the size of the rarest one.\n",
    "X_resampled_under, y_resampled_under = undersampler.fit_resample(df_train1, df_train1['new_rating'])\n",
    "\n",
    "print(\"Выборка после undersampling (датасет 1): \", pd.DataFrame(X_resampled_under).shape)\n",
    "print(pd.DataFrame(X_resampled_under)['new_rating'].value_counts())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Выборка до oversampling и undersampling: (537, 5)\n",
      "new_high\n",
      "3    253\n",
      "2    167\n",
      "4    105\n",
      "1      8\n",
      "5      4\n",
      "Name: count, dtype: int64\n",
      "Выборка после oversampling: (1265, 5)\n",
      "new_high\n",
      "1    253\n",
      "2    253\n",
      "3    253\n",
      "4    253\n",
      "5    253\n",
      "Name: count, dtype: int64\n",
      "Выборка после undersampling: (20, 5)\n",
      "new_high\n",
      "1    4\n",
      "2    4\n",
      "3    4\n",
      "4    4\n",
      "5    4\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "from imblearn.over_sampling import SMOTE\n",
    "# Columns kept for resampling: the numeric store features plus the class label.\n",
    "store_columns = ['Store_Sales', 'Store_Area', 'Items_Available', 'Daily_Customer_Count', 'new_high']\n",
    "df_train2 = df_train2.loc[:, store_columns].copy()\n",
    "store_labels = df_train2['new_high']\n",
    "\n",
    "# NOTE(review): k_neighbors is presumably lowered from the default because the\n",
    "# rarest classes contain only a handful of rows - confirm.\n",
    "smote = SMOTE(k_neighbors=2, random_state=42)\n",
    "undersampler = RandomUnderSampler(random_state=42)\n",
    "\n",
    "print(\"Выборка до oversampling и undersampling:\", df_train2.shape)\n",
    "print(store_labels.value_counts())\n",
    "\n",
    "# Oversampling with SMOTE.\n",
    "X_resampled, y_resampled = smote.fit_resample(df_train2, store_labels)\n",
    "df_train2_smote = pd.DataFrame(X_resampled, columns=store_columns)\n",
    "\n",
    "print(\"Выборка после oversampling:\", df_train2_smote.shape)\n",
    "print(df_train2_smote['new_high'].value_counts())\n",
    "\n",
    "# Random undersampling.\n",
    "X_resampled_under2, y_resampled_under2 = undersampler.fit_resample(df_train2, store_labels)\n",
    "df_train2_under = pd.DataFrame(X_resampled_under2, columns=store_columns)\n",
    "\n",
    "print(\"Выборка после undersampling:\", df_train2_under.shape)\n",
    "print(df_train2_under['new_high'].value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Выборка до oversampling и undersampling (датасет 3): (929, 6)\n",
      "new_price\n",
      "1    428\n",
      "2    366\n",
      "4     98\n",
      "3     37\n",
      "5      0\n",
      "Name: count, dtype: int64\n",
      "Выборка после oversampling (датасет 3): (1712, 6)\n",
      "new_price\n",
      "1    428\n",
      "2    428\n",
      "3    428\n",
      "4    428\n",
      "5      0\n",
      "Name: count, dtype: int64\n",
      "Выборка после undersampling (датасет 3): (148, 6)\n",
      "new_price\n",
      "1    37\n",
      "2    37\n",
      "3    37\n",
      "4    37\n",
      "5     0\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "from imblearn.over_sampling import RandomOverSampler\n",
    "\n",
    "# Columns kept for resampling: the gold price/volume features plus the class label.\n",
    "gold_columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'new_price']\n",
    "df_train3 = df_train3.loc[:, gold_columns].copy()\n",
    "gold_labels = df_train3['new_price']\n",
    "\n",
    "oversampler = RandomOverSampler(random_state=42)\n",
    "undersampler = RandomUnderSampler(random_state=42)\n",
    "\n",
    "print(\"Выборка до oversampling и undersampling (датасет 3):\", df_train3.shape)\n",
    "print(gold_labels.value_counts())\n",
    "\n",
    "# Random oversampling.\n",
    "X_resampled, y_resampled = oversampler.fit_resample(df_train3, gold_labels)\n",
    "df_train3_oversampled = pd.DataFrame(X_resampled, columns=gold_columns)\n",
    "\n",
    "print(\"Выборка после oversampling (датасет 3):\", df_train3_oversampled.shape)\n",
    "print(df_train3_oversampled['new_price'].value_counts())\n",
    "\n",
    "# Random undersampling.\n",
    "X_resampled_under, y_resampled_under = undersampler.fit_resample(df_train3, gold_labels)\n",
    "df_train3_under = pd.DataFrame(X_resampled_under, columns=gold_columns)\n",
    "\n",
    "print(\"Выборка после undersampling (датасет 3):\", df_train3_under.shape)\n",
    "print(df_train3_under['new_price'].value_counts())\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}