AIM-PIbd-31-Yakovlev-M-G/lab_2/lab_2.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Lab2 PIbd-31 Yakovlev\n",
    "Загрузим три датасета"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 532 entries, 0 to 531\n",
      "Data columns (total 18 columns):\n",
      " #   Column               Non-Null Count  Dtype  \n",
      "---  ------               --------------  -----  \n",
      " 0   brand_name           532 non-null    object \n",
      " 1   price                532 non-null    int64  \n",
      " 2   rating               532 non-null    int64  \n",
      " 3   processor_gen        520 non-null    object \n",
      " 4   processor_brand      532 non-null    object \n",
      " 5   processor_segment    528 non-null    object \n",
      " 6   CPU_mark             532 non-null    object \n",
      " 7   CPU_performance      532 non-null    object \n",
      " 8   Graphic_card_memory  530 non-null    object \n",
      " 9   graphic_card_name    530 non-null    object \n",
      " 10  graphic_card_num     532 non-null    object \n",
      " 11  Core                 530 non-null    float64\n",
      " 12  threads              514 non-null    float64\n",
      " 13  display_inches       532 non-null    object \n",
      " 14  ram_storage          532 non-null    int64  \n",
      " 15  ram_type             532 non-null    object \n",
      " 16  operating_system     502 non-null    float64\n",
      " 17  SSD_storage          532 non-null    object \n",
      "dtypes: float64(3), int64(3), object(12)\n",
      "memory usage: 74.9+ KB\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 8036 entries, 0 to 8035\n",
      "Data columns (total 7 columns):\n",
      " #   Column     Non-Null Count  Dtype  \n",
      "---  ------     --------------  -----  \n",
      " 0   Date       8036 non-null   object \n",
      " 1   Open       8036 non-null   float64\n",
      " 2   High       8036 non-null   float64\n",
      " 3   Low        8036 non-null   float64\n",
      " 4   Close      8036 non-null   float64\n",
      " 5   Adj Close  8036 non-null   float64\n",
      " 6   Volume     8036 non-null   int64  \n",
      "dtypes: float64(5), int64(1), object(1)\n",
      "memory usage: 439.6+ KB\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 19237 entries, 0 to 19236\n",
      "Data columns (total 18 columns):\n",
      " #   Column            Non-Null Count  Dtype  \n",
      "---  ------            --------------  -----  \n",
      " 0   ID                19237 non-null  int64  \n",
      " 1   Price             19237 non-null  int64  \n",
      " 2   Levy              19237 non-null  object \n",
      " 3   Manufacturer      19237 non-null  object \n",
      " 4   Model             19237 non-null  object \n",
      " 5   Prod. year        19237 non-null  int64  \n",
      " 6   Category          19237 non-null  object \n",
      " 7   Leather interior  19237 non-null  object \n",
      " 8   Fuel type         19237 non-null  object \n",
      " 9   Engine volume     19237 non-null  object \n",
      " 10  Mileage           19237 non-null  object \n",
      " 11  Cylinders         19237 non-null  float64\n",
      " 12  Gear box type     19237 non-null  object \n",
      " 13  Drive wheels      19237 non-null  object \n",
      " 14  Doors             19237 non-null  object \n",
      " 15  Wheel             19237 non-null  object \n",
      " 16  Color             19237 non-null  object \n",
      " 17  Airbags           19237 non-null  int64  \n",
      "dtypes: float64(1), int64(4), object(13)\n",
      "memory usage: 2.6+ MB\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load the three raw datasets from the local datasets/ folder.\n",
    "df = pd.read_csv(\"datasets/laptop.csv\")\n",
    "df2 = pd.read_csv(\"datasets/coffee.csv\")\n",
    "df3 = pd.read_csv(\"datasets/car_price_prediction.csv\")\n",
    "\n",
    "# Print column dtypes and non-null counts for every dataset, in loading order.\n",
    "for frame in (df, df2, df3):\n",
    "    frame.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Проблемная область\n",
    "Первый датасет позволяет проанализировать данные, и понять какие ноутбуки имеют превосходство на рынке техники, и какие чаще всего выбирают пользователи.\n",
    "Второй датасет позволяет при помощи данных акций за последние 25 лет спрогнозировать будущие показатели акций кофейни Starbucks\n",
    "Третий датасет позволяет проанализировать данные, и спрогнозировать категорию цены для машины, по ее комплектующим.\n",
    "#### Анализ набора данных\n",
    "Объекты наблюдения - игровые ноутбуки, акции, машины\n",
    "Атрибуты - \n",
    "1. Имя бренда, цена, рейтинг, поколение процессора, марка процессора, сегмент процессора, специальная метка, производительность процессора, видеокарта, видеопамять, номер видеокарты, кол-во потоков видеокарты, размер дисплея, оперативная память, тип оперативной памяти, ОС, SSD.\n",
    "2. Дата, начальная цена за день, максимальная цена, минимальная цена, цена на момент закрытия продаж, скорректированная цена на момент закрытия, объем торговли акций за день.\n",
    "3. Цена обслуживания, производитель, модель, год выпуска, категория, кожаный салон, тип топлива, объем двигателя.\n",
    "Связи между объектами - нет\n",
    "#### Бизнес-цели\n",
    "1. Какие модели игровых ноутбуков более выгодно продавать магазинам техники. Какие комплектующие наиболее популярны у производителей и покупателей, для дальнейшего увеличения производства данных комплектующих.\n",
    "2. Прогноз цен для дальнейшей покупки или продажи акций. Прогнозирование для предотвращения упадка.\n",
    "3. Для составления списка лучших моделей автомобилей. Определение наилучшего бюджетного автомобиля, который не будет часто ломаться и приносить убытки.\n",
    "#### Примеры целей технического проекта. Что поступает на вход, что является целевым признаком.\n",
    "На входе всегда датасет, целевые признаки:\n",
    "1. Рейтинг ноутбука\n",
    "2. Максимальная цена за день\n",
    "3. Цена обслуживания\n",
    "#### Проблемы набора данных и их решения\n",
    "1. Возможны устаревшие данные. Для решения данной проблемы требуется удаление самых старых записей, и добавление более новых.\n",
    "2. Возможны выбросы. Решить эту проблему помогает удаление таких выбросов. В маленькой выборке не рекомендуется удалять выбросы. В большой выборке выбросы усредняются.\n",
    "#### Качество набора данных\n",
    "Наборы данных содержат достаточно примеров и признаков для обучения модели. Учтены различные ситуации проблемной области. Данные соответствуют данным, которые будут\n",
    "подаваться в производственной среде. Все метки согласованы.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Поиск аномалий"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "               price      rating        Core     threads  ram_storage  \\\n",
      "count     532.000000  532.000000  530.000000  514.000000   532.000000   \n",
      "mean   107684.492481   67.781955    9.035849   15.089494    15.676692   \n",
      "std     80187.648965    8.161356    4.413487    5.216162     8.901257   \n",
      "min     30999.000000   43.000000    2.000000    4.000000     4.000000   \n",
      "25%     62371.750000   63.000000    6.000000   12.000000     8.000000   \n",
      "50%     83745.000000   66.000000    8.000000   16.000000    16.000000   \n",
      "75%    114040.000000   72.000000   10.000000   16.000000    16.000000   \n",
      "max    599990.000000   98.000000   24.000000   32.000000    64.000000   \n",
      "\n",
      "       operating_system  \n",
      "count        502.000000  \n",
      "mean          10.842629  \n",
      "std            0.364513  \n",
      "min           10.000000  \n",
      "25%           11.000000  \n",
      "50%           11.000000  \n",
      "75%           11.000000  \n",
      "max           11.000000  \n",
      "              Open         High          Low        Close    Adj Close  \\\n",
      "count  8036.000000  8036.000000  8036.000000  8036.000000  8036.000000   \n",
      "mean     30.054280    30.351487    29.751322    30.058857    26.674025   \n",
      "std      33.615577    33.906613    33.314569    33.615911    31.728090   \n",
      "min       0.328125     0.347656     0.320313     0.335938     0.260703   \n",
      "25%       4.392031     4.531250     4.304922     4.399610     3.414300   \n",
      "50%      13.325000    13.493750    13.150000    13.330000    10.352452   \n",
      "75%      55.250000    55.722501    54.852499    55.267499    47.464829   \n",
      "max     126.080002   126.320000   124.809998   126.059998   118.010414   \n",
      "\n",
      "             Volume  \n",
      "count  8.036000e+03  \n",
      "mean   1.470459e+07  \n",
      "std    1.340021e+07  \n",
      "min    1.504000e+06  \n",
      "25%    7.817750e+06  \n",
      "50%    1.169815e+07  \n",
      "75%    1.778795e+07  \n",
      "max    5.855088e+08  \n",
      "                 ID         Price    Prod. year     Cylinders       Airbags\n",
      "count  1.923700e+04  1.923700e+04  19237.000000  19237.000000  19237.000000\n",
      "mean   4.557654e+07  1.855593e+04   2010.912824      4.582991      6.582627\n",
      "std    9.365914e+05  1.905813e+05      5.668673      1.199933      4.320168\n",
      "min    2.074688e+07  1.000000e+00   1939.000000      1.000000      0.000000\n",
      "25%    4.569837e+07  5.331000e+03   2009.000000      4.000000      4.000000\n",
      "50%    4.577231e+07  1.317200e+04   2012.000000      4.000000      6.000000\n",
      "75%    4.580204e+07  2.207500e+04   2015.000000      4.000000     12.000000\n",
      "max    4.581665e+07  2.630750e+07   2020.000000     16.000000     16.000000\n"
     ]
    }
   ],
   "source": [
    "# Print summary statistics (count, mean, std, min, quartiles, max) for each dataset.\n",
    "for frame in (df, df2, df3):\n",
    "    print(frame.describe())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "При просмотре вывода не было замечено аномалий в столбцах датасетов."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Проблема пропущенных данных"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DATASET 1\n",
      "processor_gen процент пустых значений: %2.26\n",
      "processor_segment процент пустых значений: %0.75\n",
      "Graphic_card_memory процент пустых значений: %0.38\n",
      "graphic_card_name процент пустых значений: %0.38\n",
      "Core процент пустых значений: %0.38\n",
      "threads процент пустых значений: %3.38\n",
      "operating_system процент пустых значений: %5.64\n",
      "DATASET 2\n",
      "DATASET 3\n"
     ]
    }
   ],
   "source": [
    "def print_null_rates(title, frame):\n",
    "    \"\"\"Print `title`, then the percentage of missing values per column of `frame`.\n",
    "\n",
    "    Columns are visited in their original order; columns without missing\n",
    "    values are not printed. The percentage is relative to the number of rows.\n",
    "    \"\"\"\n",
    "    print(title)\n",
    "    for column in frame.columns:\n",
    "        null_rate = frame[column].isnull().sum() / len(frame) * 100\n",
    "        if null_rate > 0:\n",
    "            print(f\"{column} процент пустых значений: %{null_rate:.2f}\")\n",
    "\n",
    "\n",
    "# One shared helper instead of three copy-pasted loops.\n",
    "for title, frame in ((\"DATASET 1\", df), (\"DATASET 2\", df2), (\"DATASET 3\", df3)):\n",
    "    print_null_rates(title, frame)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "В первом датасете были поля с пустыми значениями, в остальных пустых значений не найдено."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "brand_name             False\n",
      "price                  False\n",
      "rating                 False\n",
      "processor_gen          False\n",
      "processor_brand        False\n",
      "processor_segment      False\n",
      "CPU_mark               False\n",
      "CPU_performance        False\n",
      "Graphic_card_memory    False\n",
      "graphic_card_name      False\n",
      "graphic_card_num       False\n",
      "Core                   False\n",
      "threads                False\n",
      "display_inches         False\n",
      "ram_storage            False\n",
      "ram_type               False\n",
      "operating_system       False\n",
      "SSD_storage            False\n",
      "dtype: bool\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>brand_name</th>\n",
       "      <th>price</th>\n",
       "      <th>rating</th>\n",
       "      <th>processor_gen</th>\n",
       "      <th>processor_brand</th>\n",
       "      <th>processor_segment</th>\n",
       "      <th>CPU_mark</th>\n",
       "      <th>CPU_performance</th>\n",
       "      <th>Graphic_card_memory</th>\n",
       "      <th>graphic_card_name</th>\n",
       "      <th>graphic_card_num</th>\n",
       "      <th>Core</th>\n",
       "      <th>threads</th>\n",
       "      <th>display_inches</th>\n",
       "      <th>ram_storage</th>\n",
       "      <th>ram_type</th>\n",
       "      <th>operating_system</th>\n",
       "      <th>SSD_storage</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>527</th>\n",
       "      <td>dell</td>\n",
       "      <td>75500</td>\n",
       "      <td>63</td>\n",
       "      <td>4th</td>\n",
       "      <td>amd</td>\n",
       "      <td>5</td>\n",
       "      <td>4600H</td>\n",
       "      <td>maximum performance</td>\n",
       "      <td>6 GB</td>\n",
       "      <td>amd radeon</td>\n",
       "      <td>other</td>\n",
       "      <td>6.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>15.6</td>\n",
       "      <td>8</td>\n",
       "      <td>DDR4</td>\n",
       "      <td>10.0</td>\n",
       "      <td>512 GB SSD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>528</th>\n",
       "      <td>lenovo</td>\n",
       "      <td>151990</td>\n",
       "      <td>75</td>\n",
       "      <td>10th</td>\n",
       "      <td>intel</td>\n",
       "      <td>i7</td>\n",
       "      <td>10875H</td>\n",
       "      <td>maximum performance</td>\n",
       "      <td>8 GB</td>\n",
       "      <td>nvidia geforce</td>\n",
       "      <td>other</td>\n",
       "      <td>8.0</td>\n",
       "      <td>16.0</td>\n",
       "      <td>15.6</td>\n",
       "      <td>16</td>\n",
       "      <td>DDR4</td>\n",
       "      <td>10.0</td>\n",
       "      <td>1 TB SSD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>529</th>\n",
       "      <td>lenovo</td>\n",
       "      <td>46500</td>\n",
       "      <td>48</td>\n",
       "      <td>8th</td>\n",
       "      <td>intel</td>\n",
       "      <td>i5</td>\n",
       "      <td>8250U</td>\n",
       "      <td>ultra-low power</td>\n",
       "      <td>Integrated</td>\n",
       "      <td>Intel Integrated</td>\n",
       "      <td>other</td>\n",
       "      <td>4.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>other</td>\n",
       "      <td>4</td>\n",
       "      <td>DDR4</td>\n",
       "      <td>0.0</td>\n",
       "      <td>other</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>530</th>\n",
       "      <td>msi</td>\n",
       "      <td>109990</td>\n",
       "      <td>61</td>\n",
       "      <td>9th</td>\n",
       "      <td>intel</td>\n",
       "      <td>i7</td>\n",
       "      <td>9750H</td>\n",
       "      <td>maximum performance</td>\n",
       "      <td>6 GB</td>\n",
       "      <td>nvidia geforce</td>\n",
       "      <td>other</td>\n",
       "      <td>6.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>other</td>\n",
       "      <td>8</td>\n",
       "      <td>other</td>\n",
       "      <td>0.0</td>\n",
       "      <td>other</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>531</th>\n",
       "      <td>hp</td>\n",
       "      <td>95800</td>\n",
       "      <td>70</td>\n",
       "      <td>9th</td>\n",
       "      <td>intel</td>\n",
       "      <td>i7</td>\n",
       "      <td>9750H</td>\n",
       "      <td>maximum performance</td>\n",
       "      <td>4 GB</td>\n",
       "      <td>nvidia geforce</td>\n",
       "      <td>1650</td>\n",
       "      <td>6.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>15.6</td>\n",
       "      <td>8</td>\n",
       "      <td>DDR4</td>\n",
       "      <td>0.0</td>\n",
       "      <td>other</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    brand_name   price  rating processor_gen processor_brand  \\\n",
       "527       dell   75500      63           4th             amd   \n",
       "528     lenovo  151990      75          10th           intel   \n",
       "529     lenovo   46500      48           8th           intel   \n",
       "530        msi  109990      61           9th           intel   \n",
       "531         hp   95800      70           9th           intel   \n",
       "\n",
       "    processor_segment CPU_mark      CPU_performance Graphic_card_memory  \\\n",
       "527                 5    4600H  maximum performance                6 GB   \n",
       "528                i7   10875H  maximum performance                8 GB   \n",
       "529                i5    8250U      ultra-low power          Integrated   \n",
       "530                i7    9750H  maximum performance                6 GB   \n",
       "531                i7    9750H  maximum performance                4 GB   \n",
       "\n",
       "    graphic_card_name graphic_card_num  Core  threads display_inches  \\\n",
       "527        amd radeon            other   6.0     12.0           15.6   \n",
       "528    nvidia geforce            other   8.0     16.0           15.6   \n",
       "529  Intel Integrated            other   4.0      8.0          other   \n",
       "530    nvidia geforce            other   6.0     12.0          other   \n",
       "531    nvidia geforce             1650   6.0     12.0           15.6   \n",
       "\n",
       "     ram_storage ram_type  operating_system SSD_storage  \n",
       "527            8     DDR4              10.0  512 GB SSD  \n",
       "528           16     DDR4              10.0    1 TB SSD  \n",
       "529            4     DDR4               0.0       other  \n",
       "530            8    other               0.0       other  \n",
       "531            8     DDR4               0.0       other  "
      ]
     },
     "execution_count": 127,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Replace every missing value with 0 (applied to all columns of the first dataset).\n",
    "df = df.fillna(value=0)\n",
    "\n",
    "# Per-column flag: does the column still contain missing values?\n",
    "has_missing = df.isnull().any()\n",
    "print(has_missing)\n",
    "\n",
    "# Preview the last rows of the filled frame.\n",
    "df.tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Разбиение на выборки\n",
    "Для разбиения на выборки для начала уменьшим количество уникальных значений в столбцах с целевыми признаками, путем добавления новых столбцов с малым количеством уникальных значений."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _to_levels(values, edges):\n",
    "    \"\"\"Bin `values` by `edges` into integer levels 1..len(edges)-1; the lowest edge is inclusive.\"\"\"\n",
    "    level_labels = list(range(1, len(edges)))\n",
    "    return pd.cut(values, bins=edges, labels=level_labels, include_lowest=True)\n",
    "\n",
    "\n",
    "# Dataset 1: new column with the rating reduced to five levels (1-5), 20 points per level.\n",
    "df['new_rating'] = _to_levels(df['rating'], [0, 20, 40, 60, 80, 100])\n",
    "\n",
    "# Dataset 2: new column with the daily high price reduced to ten levels (1-10), 13 units per level.\n",
    "df2['new_high'] = _to_levels(df2['High'], [0, 13, 26, 39, 52, 65, 78, 91, 104, 117, 130])\n",
    "\n",
    "# Dataset 3: keep only rows with 10000 <= Price <= 100000, then add five price levels (1-5).\n",
    "price_in_range = (df3['Price'] >= 10000) & (df3['Price'] <= 100000)\n",
    "df3_filtered = df3[price_in_range].copy()\n",
    "df3_filtered['new_price'] = _to_levels(df3_filtered['Price'], [10000, 28000, 46000, 64000, 82000, 100000])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "def split_stratified_into_train_val_test(\n",
    "    df_input,\n",
    "    stratify_colname=\"y\",\n",
    "    frac_train=0.6,\n",
    "    frac_val=0.15,\n",
    "    frac_test=0.25,\n",
    "    random_state=None,\n",
    "):\n",
    "    if frac_train + frac_val + frac_test != 1.0:\n",
    "        raise ValueError(\n",
    "            \"fractions %f, %f, %f do not add up to 1.0\"\n",
    "            % (frac_train, frac_val, frac_test)\n",
    "        )\n",
    "\n",
    "    if stratify_colname not in df_input.columns:\n",
    "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
    "\n",
    "    X = df_input  # Contains all columns.\n",
    "    y = df_input[\n",
    "        [stratify_colname]\n",
    "    ]  # Dataframe of just the column on which to stratify.\n",
    "\n",
    "    # Split original dataframe into train and temp dataframes.\n",
    "    df_train, df_temp, y_train, y_temp = train_test_split(\n",
    "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
    "    )\n",
    "\n",
    "    # Split the temp dataframe into val and test dataframes.\n",
    "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
    "    df_val, df_test, y_val, y_test = train_test_split(\n",
    "        df_temp,\n",
    "        y_temp,\n",
    "        stratify=y_temp,\n",
    "        test_size=relative_frac_test,\n",
    "        random_state=random_state,\n",
    "    )\n",
    "\n",
    "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
    "\n",
    "    return df_train, df_val, df_test"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Выборки датасетов"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train df: (319, 19), Val df: (106, 19), Test df:(107, 19)\n",
      "train df2: (4821, 8), Val df2: (1607, 8), Test df2:(1608, 8)\n",
      "train df3_filtered: (6931, 19), Val df3_filtered: (2310, 19), Test df3_filtered:(2311, 19)\n"
     ]
    }
   ],
   "source": [
    "# Fixed seed so that the stratified splits are identical on every run;\n",
    "# the same 60/20/20 proportions are used for all three datasets.\n",
    "SPLIT_SEED = 42\n",
    "split_options = dict(frac_train=0.60, frac_val=0.20, frac_test=0.20, random_state=SPLIT_SEED)\n",
    "\n",
    "df_train1, df_val1, df_test1 = split_stratified_into_train_val_test(\n",
    "    df, stratify_colname=\"new_rating\", **split_options\n",
    ")\n",
    "\n",
    "df_train2, df_val2, df_test2 = split_stratified_into_train_val_test(\n",
    "    df2, stratify_colname=\"new_high\", **split_options\n",
    ")\n",
    "\n",
    "df_train3, df_val3, df_test3 = split_stratified_into_train_val_test(\n",
    "    df3_filtered, stratify_colname=\"new_price\", **split_options\n",
    ")\n",
    "\n",
    "# Report the (rows, columns) shape of every part.\n",
    "print(f\"train df: {df_train1.shape}, Val df: {df_val1.shape}, Test df:{df_test1.shape}\")\n",
    "print(f\"train df2: {df_train2.shape}, Val df2: {df_val2.shape}, Test df2:{df_test2.shape}\")\n",
    "print(f\"train df3_filtered: {df_train3.shape}, Val df3_filtered: {df_val3.shape}, Test df3_filtered:{df_test3.shape}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Было сделано разбиение на три выборки: 60%, 20% и 20% при помощи библиотеки scikit-learn и функции train_test_split. На первый взгляд выборки сбалансированы.\n",
    "### Приращение методами выборки с избытком (oversampling) и выборки с недостатком (undersampling)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Выборка до oversampling и undersampling: (319, 6)\n",
      "new_rating\n",
      "4    249\n",
      "3     44\n",
      "5     26\n",
      "1      0\n",
      "2      0\n",
      "Name: count, dtype: int64\n",
      "Выборка после oversampling:  (750, 6)\n",
      "new_rating\n",
      "5    251\n",
      "3    250\n",
      "4    249\n",
      "1      0\n",
      "2      0\n",
      "Name: count, dtype: int64\n",
      "Выборка после undersampling:  (78, 6)\n",
      "new_rating\n",
      "3    26\n",
      "5    26\n",
      "4    26\n",
      "1     0\n",
      "2     0\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "from imblearn.over_sampling import ADASYN\n",
    "from imblearn.under_sampling import RandomUnderSampler\n",
    "\n",
    "# Keep the numeric features and the binned target of the first training split.\n",
    "df_train1 = df_train1[['price', 'rating', 'threads', 'ram_storage', 'operating_system', 'new_rating']].copy()\n",
    "\n",
    "# Both samplers are seeded so that repeated runs produce the same resampled data.\n",
    "ada = ADASYN(random_state=42)\n",
    "undersampler = RandomUnderSampler(random_state=42)\n",
    "\n",
    "# fit_resample takes the feature matrix and the target separately, so the target\n",
    "# column is removed from X here and re-attached to the resampled frames below.\n",
    "features1 = df_train1.drop(columns=['new_rating'])\n",
    "target1 = df_train1['new_rating']\n",
    "\n",
    "print(\"Выборка до oversampling и undersampling:\", df_train1.shape)\n",
    "print(df_train1.new_rating.value_counts())\n",
    "\n",
    "X_resampled, y_resampled = ada.fit_resample(features1, target1)\n",
    "df_train1_adasyn = X_resampled.reset_index(drop=True)\n",
    "# .values assigns by position, so index labels of X and y cannot misalign.\n",
    "df_train1_adasyn['new_rating'] = pd.Series(y_resampled).values\n",
    "df_train1_adasyn = df_train1_adasyn[df_train1.columns]  # same column order as df_train1\n",
    "\n",
    "print(\"Выборка после oversampling: \", df_train1_adasyn.shape)\n",
    "print(df_train1_adasyn.new_rating.value_counts())\n",
    "\n",
    "X_resampled_under, y_resampled_under = undersampler.fit_resample(features1, target1)\n",
    "df_train1_under = X_resampled_under.reset_index(drop=True)\n",
    "df_train1_under['new_rating'] = pd.Series(y_resampled_under).values\n",
    "df_train1_under = df_train1_under[df_train1.columns]\n",
    "\n",
    "print(\"Выборка после undersampling: \", df_train1_under.shape)\n",
    "print(df_train1_under.new_rating.value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Выборка до oversampling и undersampling: (4821, 7)\n",
      "new_high\n",
      "1     2326\n",
      "2      704\n",
      "5      519\n",
      "3      299\n",
      "8      242\n",
      "7      222\n",
      "9      181\n",
      "4      151\n",
      "6      146\n",
      "10      31\n",
      "Name: count, dtype: int64\n",
      "Выборка после oversampling:  (23144, 7)\n",
      "new_high\n",
      "8     2374\n",
      "6     2368\n",
      "2     2351\n",
      "4     2335\n",
      "1     2326\n",
      "9     2317\n",
      "10    2312\n",
      "5     2256\n",
      "7     2256\n",
      "3     2249\n",
      "Name: count, dtype: int64\n",
      "Выборка после undersampling:  (310, 7)\n",
      "new_high\n",
      "1     31\n",
      "2     31\n",
      "3     31\n",
      "4     31\n",
      "5     31\n",
      "6     31\n",
      "7     31\n",
      "8     31\n",
      "9     31\n",
      "10    31\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Keep the numeric features and the binned target of the second training split.\n",
    "df_train2 = df_train2[['Open', 'High', 'new_high', 'Low', 'Close', 'Adj Close', 'Volume']].copy()\n",
    "\n",
    "# fit_resample takes the feature matrix and the target separately, so the target\n",
    "# column is removed from X here and re-attached to the resampled frames below.\n",
    "features2 = df_train2.drop(columns=['new_high'])\n",
    "target2 = df_train2['new_high']\n",
    "\n",
    "print(\"Выборка до oversampling и undersampling:\", df_train2.shape)\n",
    "print(df_train2.new_high.value_counts())\n",
    "\n",
    "# `ada` and `undersampler` are the sampler objects created in the previous cell.\n",
    "X_resampled, y_resampled = ada.fit_resample(features2, target2)\n",
    "df_train2_adasyn = X_resampled.reset_index(drop=True)\n",
    "# .values assigns by position, so index labels of X and y cannot misalign.\n",
    "df_train2_adasyn['new_high'] = pd.Series(y_resampled).values\n",
    "df_train2_adasyn = df_train2_adasyn[df_train2.columns]  # same column order as df_train2\n",
    "\n",
    "print(\"Выборка после oversampling: \", df_train2_adasyn.shape)\n",
    "print(df_train2_adasyn.new_high.value_counts())\n",
    "\n",
    "X_resampled_under2, y_resampled_under2 = undersampler.fit_resample(features2, target2)\n",
    "df_train2_under = X_resampled_under2.reset_index(drop=True)\n",
    "df_train2_under['new_high'] = pd.Series(y_resampled_under2).values\n",
    "df_train2_under = df_train2_under[df_train2.columns]\n",
    "\n",
    "print(\"Выборка после undersampling: \", df_train2_under.shape)\n",
    "print(df_train2_under.new_high.value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Выборка до oversampling и undersampling: (6931, 5)\n",
      "new_price\n",
      "1    5008\n",
      "2    1281\n",
      "3     449\n",
      "4     136\n",
      "5      57\n",
      "Name: count, dtype: int64\n",
      "Выборка после oversampling:  (25040, 5)\n",
      "new_price\n",
      "1    5008\n",
      "2    5008\n",
      "3    5008\n",
      "4    5008\n",
      "5    5008\n",
      "Name: count, dtype: int64\n",
      "Выборка после undersampling:  (285, 5)\n",
      "new_price\n",
      "1    57\n",
      "2    57\n",
      "3    57\n",
      "4    57\n",
      "5    57\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "from imblearn.over_sampling import SMOTE\n",
    "\n",
    "# Keep the numeric features and the binned target of the third training split.\n",
    "df_train3 = df_train3[['Price', 'new_price','Prod. year' ,'Cylinders' ,'Airbags']].copy()\n",
    "\n",
    "smote = SMOTE(random_state=42)\n",
    "\n",
    "# fit_resample takes the feature matrix and the target separately, so the target\n",
    "# column is removed from X here and re-attached to the resampled frames below.\n",
    "features3 = df_train3.drop(columns=['new_price'])\n",
    "target3 = df_train3['new_price']\n",
    "\n",
    "print(\"Выборка до oversampling и undersampling:\", df_train3.shape)\n",
    "print(df_train3.new_price.value_counts())\n",
    "\n",
    "X_resampled, y_resampled = smote.fit_resample(features3, target3)\n",
    "df_train3_smote = X_resampled.reset_index(drop=True)\n",
    "# .values assigns by position, so index labels of X and y cannot misalign.\n",
    "df_train3_smote['new_price'] = pd.Series(y_resampled).values\n",
    "df_train3_smote = df_train3_smote[df_train3.columns]  # same column order as df_train3\n",
    "\n",
    "print(\"Выборка после oversampling: \", df_train3_smote.shape)\n",
    "print(df_train3_smote.new_price.value_counts())\n",
    "\n",
    "# `undersampler` is the RandomUnderSampler created in an earlier cell.\n",
    "X_resampled_under3, y_resampled_under3 = undersampler.fit_resample(features3, target3)\n",
    "df_train3_under = X_resampled_under3.reset_index(drop=True)\n",
    "df_train3_under['new_price'] = pd.Series(y_resampled_under3).values\n",
    "df_train3_under = df_train3_under[df_train3.columns]\n",
    "\n",
    "print(\"Выборка после undersampling: \", df_train3_under.shape)\n",
    "print(df_train3_under.new_price.value_counts())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "kernel",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}