{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### DataSet - \"Gaming Laptop Specs and Price\"\n",
    "Данный датасет содержит данные о игровых ноутбуках."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 532 entries, 0 to 531\n",
      "Data columns (total 18 columns):\n",
      " #   Column               Non-Null Count  Dtype  \n",
      "---  ------               --------------  -----  \n",
      " 0   brand_name           532 non-null    object \n",
      " 1   price                532 non-null    int64  \n",
      " 2   rating               532 non-null    int64  \n",
      " 3   processor_gen        520 non-null    object \n",
      " 4   processor_brand      532 non-null    object \n",
      " 5   processor_segment    528 non-null    object \n",
      " 6   CPU_mark             532 non-null    object \n",
      " 7   CPU_performance      532 non-null    object \n",
      " 8   Graphic_card_memory  530 non-null    object \n",
      " 9   graphic_card_name    530 non-null    object \n",
      " 10  graphic_card_num     532 non-null    object \n",
      " 11  Core                 530 non-null    float64\n",
      " 12  threads              514 non-null    float64\n",
      " 13  display_inches       532 non-null    object \n",
      " 14  ram_storage          532 non-null    int64  \n",
      " 15  ram_type             532 non-null    object \n",
      " 16  operating_system     502 non-null    float64\n",
      " 17  SSD_storage          532 non-null    object \n",
      "dtypes: float64(3), int64(3), object(12)\n",
      "memory usage: 74.9+ KB\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>brand_name</th>\n",
       "      <th>price</th>\n",
       "      <th>rating</th>\n",
       "      <th>processor_gen</th>\n",
       "      <th>processor_brand</th>\n",
       "      <th>processor_segment</th>\n",
       "      <th>CPU_mark</th>\n",
       "      <th>CPU_performance</th>\n",
       "      <th>Graphic_card_memory</th>\n",
       "      <th>graphic_card_name</th>\n",
       "      <th>graphic_card_num</th>\n",
       "      <th>Core</th>\n",
       "      <th>threads</th>\n",
       "      <th>display_inches</th>\n",
       "      <th>ram_storage</th>\n",
       "      <th>ram_type</th>\n",
       "      <th>operating_system</th>\n",
       "      <th>SSD_storage</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>hp</td>\n",
       "      <td>49490</td>\n",
       "      <td>70</td>\n",
       "      <td>5th</td>\n",
       "      <td>amd</td>\n",
       "      <td>5</td>\n",
       "      <td>5600H</td>\n",
       "      <td>maximum performance</td>\n",
       "      <td>4 GB</td>\n",
       "      <td>amd radeon</td>\n",
       "      <td>other</td>\n",
       "      <td>6.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>15.6</td>\n",
       "      <td>8</td>\n",
       "      <td>DDR4</td>\n",
       "      <td>11.0</td>\n",
       "      <td>512 GB SSD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>xiaomi</td>\n",
       "      <td>102990</td>\n",
       "      <td>78</td>\n",
       "      <td>14th</td>\n",
       "      <td>intel</td>\n",
       "      <td>i9</td>\n",
       "      <td>14900HX</td>\n",
       "      <td>maximum performance</td>\n",
       "      <td>8 GB</td>\n",
       "      <td>nvidia geforce</td>\n",
       "      <td>4060</td>\n",
       "      <td>24.0</td>\n",
       "      <td>32.0</td>\n",
       "      <td>other</td>\n",
       "      <td>16</td>\n",
       "      <td>DDR5</td>\n",
       "      <td>11.0</td>\n",
       "      <td>1 TB SSD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>hp</td>\n",
       "      <td>81490</td>\n",
       "      <td>73</td>\n",
       "      <td>7th</td>\n",
       "      <td>amd</td>\n",
       "      <td>7</td>\n",
       "      <td>7840HS</td>\n",
       "      <td>high efficiency</td>\n",
       "      <td>6 GB</td>\n",
       "      <td>nvidia geforce</td>\n",
       "      <td>3050</td>\n",
       "      <td>8.0</td>\n",
       "      <td>16.0</td>\n",
       "      <td>other</td>\n",
       "      <td>16</td>\n",
       "      <td>DDR5</td>\n",
       "      <td>11.0</td>\n",
       "      <td>1 TB SSD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>asus</td>\n",
       "      <td>49990</td>\n",
       "      <td>64</td>\n",
       "      <td>11th</td>\n",
       "      <td>intel</td>\n",
       "      <td>i5</td>\n",
       "      <td>11400H</td>\n",
       "      <td>maximum performance</td>\n",
       "      <td>4 GB</td>\n",
       "      <td>nvidia geforce</td>\n",
       "      <td>2050</td>\n",
       "      <td>6.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>15.6</td>\n",
       "      <td>8</td>\n",
       "      <td>DDR4</td>\n",
       "      <td>11.0</td>\n",
       "      <td>512 GB SSD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>asus</td>\n",
       "      <td>52990</td>\n",
       "      <td>66</td>\n",
       "      <td>11th</td>\n",
       "      <td>intel</td>\n",
       "      <td>i5</td>\n",
       "      <td>11400H</td>\n",
       "      <td>maximum performance</td>\n",
       "      <td>4 GB</td>\n",
       "      <td>nvidia geforce</td>\n",
       "      <td>2050</td>\n",
       "      <td>6.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>15.6</td>\n",
       "      <td>16</td>\n",
       "      <td>DDR4</td>\n",
       "      <td>11.0</td>\n",
       "      <td>512 GB SSD</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  brand_name   price  rating processor_gen processor_brand processor_segment  \\\n",
       "0         hp   49490      70           5th             amd                 5   \n",
       "1     xiaomi  102990      78          14th           intel                i9   \n",
       "2         hp   81490      73           7th             amd                 7   \n",
       "3       asus   49990      64          11th           intel                i5   \n",
       "4       asus   52990      66          11th           intel                i5   \n",
       "\n",
       "  CPU_mark      CPU_performance Graphic_card_memory graphic_card_name  \\\n",
       "0    5600H  maximum performance                4 GB        amd radeon   \n",
       "1  14900HX  maximum performance                8 GB    nvidia geforce   \n",
       "2   7840HS      high efficiency                6 GB    nvidia geforce   \n",
       "3   11400H  maximum performance                4 GB    nvidia geforce   \n",
       "4   11400H  maximum performance                4 GB    nvidia geforce   \n",
       "\n",
       "  graphic_card_num  Core  threads display_inches  ram_storage ram_type  \\\n",
       "0            other   6.0     12.0           15.6            8     DDR4   \n",
       "1             4060  24.0     32.0          other           16     DDR5   \n",
       "2             3050   8.0     16.0          other           16     DDR5   \n",
       "3             2050   6.0     12.0           15.6            8     DDR4   \n",
       "4             2050   6.0     12.0           15.6           16     DDR4   \n",
       "\n",
       "   operating_system SSD_storage  \n",
       "0              11.0  512 GB SSD  \n",
       "1              11.0    1 TB SSD  \n",
       "2              11.0    1 TB SSD  \n",
       "3              11.0  512 GB SSD  \n",
       "4              11.0  512 GB SSD  "
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"datasets/laptop.csv\")\n",
    "df.info()\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Проблемная область\n",
    "Данный датасет позволяет проанализировать данные, и понять какие ноутбуки имеют превосходство на рынке техники, и какие чаще всего выбирают пользователи.\n",
    "#### Анализ набора данных\n",
    "Объекты наблюдения - игровые ноутбуки\n",
    "Атрибуты - Имя бренда, цена, рейтинг, поколение процессора, марка процессора, сегмент процессора, специальная метка, производительность процессора, видеокарта, видеопамять, номер видеокарты, кол-во потоков видеокарты, размер дисплея, оперативная память, тип оперативной памяти, ОС, SSD.\n",
    "Связи между объектами - нет\n",
    "#### Бизнес-цели\n",
    "Данный набор данных может помочь определить лидеров на рынке игровых ноутбуков.\n",
    "В свою очередь определение лидеров поможет определить:\n",
    "1. Какие модели игровых ноутбуков более выгодно продавать магазинам техники.\n",
    "2. Какие комплектующие наиболее популярны у производителей и покупателей, для дальнейшего увеличения производства данных комплектующих.\n",
    "3. Определение популярных комлпектующих, для дальнейшей сборки других игровых ноутбуков новых версий.\n",
    "#### Примеры целей технического проекта. Что поступает на вход, что является целевым признаком?????????\n",
    "#### Проблемы набора данных и их решения\n",
    "1. Возможны устаревшие данные, т.к. новые комплектующие выходят довольно часто. Для решения данной проблемы требуется удаление самых старых записей о ноутбуках, и добавление более новых моделей.\n",
    "2. Возможны выбросы, какие-то \"сверхестественные сборки\". Решить эту проблему помогает удаление таких выбросов. В маленькой выборке не рекомендуется удалять выбросы. В большой выборке выбросы усредняются.\n",
    "#### Качество набора данных\n",
    "Набор данных содержит достаточно примеров и признаков для обучения модели. Учтены различные ситуации проблемной области. Данные соответствуют данным, которые будут\n",
    "подаваться в производственной среде. Все метки согласованы.\n",
    "#### Проблема пропущенных данных"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "processor_gen процент пустых значений: %2.26\n",
      "processor_segment процент пустых значений: %0.75\n",
      "Graphic_card_memory процент пустых значений: %0.38\n",
      "graphic_card_name процент пустых значений: %0.38\n",
      "Core процент пустых значений: %0.38\n",
      "threads процент пустых значений: %3.38\n",
      "operating_system процент пустых значений: %5.64\n"
     ]
    }
   ],
   "source": [
    "for i in df.columns:\n",
    "    null_rate = df[i].isnull().sum() / len(df)*100\n",
    "    if null_rate > 0:\n",
    "        print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "brand_name             False\n",
      "price                  False\n",
      "rating                 False\n",
      "processor_gen          False\n",
      "processor_brand        False\n",
      "processor_segment      False\n",
      "CPU_mark               False\n",
      "CPU_performance        False\n",
      "Graphic_card_memory    False\n",
      "graphic_card_name      False\n",
      "graphic_card_num       False\n",
      "Core                   False\n",
      "threads                False\n",
      "display_inches         False\n",
      "ram_storage            False\n",
      "ram_type               False\n",
      "operating_system       False\n",
      "SSD_storage            False\n",
      "dtype: bool\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>brand_name</th>\n",
       "      <th>price</th>\n",
       "      <th>rating</th>\n",
       "      <th>processor_gen</th>\n",
       "      <th>processor_brand</th>\n",
       "      <th>processor_segment</th>\n",
       "      <th>CPU_mark</th>\n",
       "      <th>CPU_performance</th>\n",
       "      <th>Graphic_card_memory</th>\n",
       "      <th>graphic_card_name</th>\n",
       "      <th>graphic_card_num</th>\n",
       "      <th>Core</th>\n",
       "      <th>threads</th>\n",
       "      <th>display_inches</th>\n",
       "      <th>ram_storage</th>\n",
       "      <th>ram_type</th>\n",
       "      <th>operating_system</th>\n",
       "      <th>SSD_storage</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>527</th>\n",
       "      <td>dell</td>\n",
       "      <td>75500</td>\n",
       "      <td>63</td>\n",
       "      <td>4th</td>\n",
       "      <td>amd</td>\n",
       "      <td>5</td>\n",
       "      <td>4600H</td>\n",
       "      <td>maximum performance</td>\n",
       "      <td>6 GB</td>\n",
       "      <td>amd radeon</td>\n",
       "      <td>other</td>\n",
       "      <td>6.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>15.6</td>\n",
       "      <td>8</td>\n",
       "      <td>DDR4</td>\n",
       "      <td>10.0</td>\n",
       "      <td>512 GB SSD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>528</th>\n",
       "      <td>lenovo</td>\n",
       "      <td>151990</td>\n",
       "      <td>75</td>\n",
       "      <td>10th</td>\n",
       "      <td>intel</td>\n",
       "      <td>i7</td>\n",
       "      <td>10875H</td>\n",
       "      <td>maximum performance</td>\n",
       "      <td>8 GB</td>\n",
       "      <td>nvidia geforce</td>\n",
       "      <td>other</td>\n",
       "      <td>8.0</td>\n",
       "      <td>16.0</td>\n",
       "      <td>15.6</td>\n",
       "      <td>16</td>\n",
       "      <td>DDR4</td>\n",
       "      <td>10.0</td>\n",
       "      <td>1 TB SSD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>529</th>\n",
       "      <td>lenovo</td>\n",
       "      <td>46500</td>\n",
       "      <td>48</td>\n",
       "      <td>8th</td>\n",
       "      <td>intel</td>\n",
       "      <td>i5</td>\n",
       "      <td>8250U</td>\n",
       "      <td>ultra-low power</td>\n",
       "      <td>Integrated</td>\n",
       "      <td>Intel Integrated</td>\n",
       "      <td>other</td>\n",
       "      <td>4.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>other</td>\n",
       "      <td>4</td>\n",
       "      <td>DDR4</td>\n",
       "      <td>0.0</td>\n",
       "      <td>other</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>530</th>\n",
       "      <td>msi</td>\n",
       "      <td>109990</td>\n",
       "      <td>61</td>\n",
       "      <td>9th</td>\n",
       "      <td>intel</td>\n",
       "      <td>i7</td>\n",
       "      <td>9750H</td>\n",
       "      <td>maximum performance</td>\n",
       "      <td>6 GB</td>\n",
       "      <td>nvidia geforce</td>\n",
       "      <td>other</td>\n",
       "      <td>6.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>other</td>\n",
       "      <td>8</td>\n",
       "      <td>other</td>\n",
       "      <td>0.0</td>\n",
       "      <td>other</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>531</th>\n",
       "      <td>hp</td>\n",
       "      <td>95800</td>\n",
       "      <td>70</td>\n",
       "      <td>9th</td>\n",
       "      <td>intel</td>\n",
       "      <td>i7</td>\n",
       "      <td>9750H</td>\n",
       "      <td>maximum performance</td>\n",
       "      <td>4 GB</td>\n",
       "      <td>nvidia geforce</td>\n",
       "      <td>1650</td>\n",
       "      <td>6.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>15.6</td>\n",
       "      <td>8</td>\n",
       "      <td>DDR4</td>\n",
       "      <td>0.0</td>\n",
       "      <td>other</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    brand_name   price  rating processor_gen processor_brand  \\\n",
       "527       dell   75500      63           4th             amd   \n",
       "528     lenovo  151990      75          10th           intel   \n",
       "529     lenovo   46500      48           8th           intel   \n",
       "530        msi  109990      61           9th           intel   \n",
       "531         hp   95800      70           9th           intel   \n",
       "\n",
       "    processor_segment CPU_mark      CPU_performance Graphic_card_memory  \\\n",
       "527                 5    4600H  maximum performance                6 GB   \n",
       "528                i7   10875H  maximum performance                8 GB   \n",
       "529                i5    8250U      ultra-low power          Integrated   \n",
       "530                i7    9750H  maximum performance                6 GB   \n",
       "531                i7    9750H  maximum performance                4 GB   \n",
       "\n",
       "    graphic_card_name graphic_card_num  Core  threads display_inches  \\\n",
       "527        amd radeon            other   6.0     12.0           15.6   \n",
       "528    nvidia geforce            other   8.0     16.0           15.6   \n",
       "529  Intel Integrated            other   4.0      8.0          other   \n",
       "530    nvidia geforce            other   6.0     12.0          other   \n",
       "531    nvidia geforce             1650   6.0     12.0           15.6   \n",
       "\n",
       "     ram_storage ram_type  operating_system SSD_storage  \n",
       "527            8     DDR4              10.0  512 GB SSD  \n",
       "528           16     DDR4              10.0    1 TB SSD  \n",
       "529            4     DDR4               0.0       other  \n",
       "530            8    other               0.0       other  \n",
       "531            8     DDR4               0.0       other  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = df.fillna(0) #Замена пустых значений на 0\n",
    "print(df.isnull().any())\n",
    "df.tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Разбиение на выборки"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[24], line 46\u001b[0m\n\u001b[0;32m     42\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m df_train, df_val, df_test\n\u001b[0;32m     44\u001b[0m data \u001b[38;5;241m=\u001b[39m df[[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrating\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprice\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mram_storage\u001b[39m\u001b[38;5;124m\"\u001b[39m]]\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[1;32m---> 46\u001b[0m df_train, df_val, df_test \u001b[38;5;241m=\u001b[39m \u001b[43msplit_stratified_into_train_val_test\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m     47\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstratify_colname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrating\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_train\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.60\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_val\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_test\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\n\u001b[0;32m     48\u001b[0m \u001b[43m)\u001b[49m\n\u001b[0;32m     50\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_train\u001b[38;5;241m.\u001b[39mshape)\n\u001b[0;32m     52\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mКонтрольная выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_val\u001b[38;5;241m.\u001b[39mshape)\n",
      "Cell \u001b[1;32mIn[24], line 26\u001b[0m, in \u001b[0;36msplit_stratified_into_train_val_test\u001b[1;34m(df_input, stratify_colname, frac_train, frac_val, frac_test, random_state)\u001b[0m\n\u001b[0;32m     21\u001b[0m y \u001b[38;5;241m=\u001b[39m df_input[\n\u001b[0;32m     22\u001b[0m     [stratify_colname]\n\u001b[0;32m     23\u001b[0m ]  \u001b[38;5;66;03m# Dataframe of just the column on which to stratify.\u001b[39;00m\n\u001b[0;32m     25\u001b[0m \u001b[38;5;66;03m# Split original dataframe into train and temp dataframes.\u001b[39;00m\n\u001b[1;32m---> 26\u001b[0m df_train, df_temp, y_train, y_temp \u001b[38;5;241m=\u001b[39m \u001b[43mtrain_test_split\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m     27\u001b[0m \u001b[43m    \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstratify\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m1.0\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mfrac_train\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrandom_state\u001b[49m\n\u001b[0;32m     28\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m     30\u001b[0m \u001b[38;5;66;03m# Split the temp dataframe into val and test dataframes.\u001b[39;00m\n\u001b[0;32m     31\u001b[0m relative_frac_test \u001b[38;5;241m=\u001b[39m frac_test \u001b[38;5;241m/\u001b[39m (frac_val \u001b[38;5;241m+\u001b[39m frac_test)\n",
      "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:213\u001b[0m, in \u001b[0;36mvalidate_params.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m    207\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m    208\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m    209\u001b[0m         skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m    210\u001b[0m             prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m    211\u001b[0m         )\n\u001b[0;32m    212\u001b[0m     ):\n\u001b[1;32m--> 213\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    214\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m    215\u001b[0m     \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m    216\u001b[0m     \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m    217\u001b[0m     \u001b[38;5;66;03m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m    218\u001b[0m     \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m    219\u001b[0m     msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[0;32m    220\u001b[0m         \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m    221\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m    222\u001b[0m         \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[0;32m    223\u001b[0m     )\n",
      "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:2806\u001b[0m, in \u001b[0;36mtrain_test_split\u001b[1;34m(test_size, train_size, random_state, shuffle, stratify, *arrays)\u001b[0m\n\u001b[0;32m   2802\u001b[0m         CVClass \u001b[38;5;241m=\u001b[39m ShuffleSplit\n\u001b[0;32m   2804\u001b[0m     cv \u001b[38;5;241m=\u001b[39m CVClass(test_size\u001b[38;5;241m=\u001b[39mn_test, train_size\u001b[38;5;241m=\u001b[39mn_train, random_state\u001b[38;5;241m=\u001b[39mrandom_state)\n\u001b[1;32m-> 2806\u001b[0m     train, test \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcv\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43marrays\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstratify\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   2808\u001b[0m train, test \u001b[38;5;241m=\u001b[39m ensure_common_namespace_device(arrays[\u001b[38;5;241m0\u001b[39m], train, test)\n\u001b[0;32m   2810\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(\n\u001b[0;32m   2811\u001b[0m     chain\u001b[38;5;241m.\u001b[39mfrom_iterable(\n\u001b[0;32m   2812\u001b[0m         (_safe_indexing(a, train), _safe_indexing(a, test)) \u001b[38;5;28;01mfor\u001b[39;00m a \u001b[38;5;129;01min\u001b[39;00m arrays\n\u001b[0;32m   2813\u001b[0m     )\n\u001b[0;32m   2814\u001b[0m )\n",
      "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:1843\u001b[0m, in \u001b[0;36mBaseShuffleSplit.split\u001b[1;34m(self, X, y, groups)\u001b[0m\n\u001b[0;32m   1813\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Generate indices to split data into training and test set.\u001b[39;00m\n\u001b[0;32m   1814\u001b[0m \n\u001b[0;32m   1815\u001b[0m \u001b[38;5;124;03mParameters\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   1840\u001b[0m \u001b[38;5;124;03mto an integer.\u001b[39;00m\n\u001b[0;32m   1841\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m   1842\u001b[0m X, y, groups \u001b[38;5;241m=\u001b[39m indexable(X, y, groups)\n\u001b[1;32m-> 1843\u001b[0m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_iter_indices\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgroups\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[0;32m   1844\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;28;43;01myield\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\n",
      "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:2252\u001b[0m, in \u001b[0;36mStratifiedShuffleSplit._iter_indices\u001b[1;34m(self, X, y, groups)\u001b[0m\n\u001b[0;32m   2250\u001b[0m class_counts \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mbincount(y_indices)\n\u001b[0;32m   2251\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m np\u001b[38;5;241m.\u001b[39mmin(class_counts) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m2\u001b[39m:\n\u001b[1;32m-> 2252\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m   2253\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe least populated class in y has only 1\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   2254\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m member, which is too few. The minimum\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   2255\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m number of groups for any class cannot\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   2256\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m be less than 2.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   2257\u001b[0m     )\n\u001b[0;32m   2259\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n_train \u001b[38;5;241m<\u001b[39m n_classes:\n\u001b[0;32m   2260\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m   2261\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe train_size = \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m should be greater or \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   2262\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mequal to the number of classes = \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m (n_train, n_classes)\n\u001b[0;32m   2263\u001b[0m     )\n",
      "\u001b[1;31mValueError\u001b[0m: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2."
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "def split_stratified_into_train_val_test(\n",
    "    df_input,\n",
    "    stratify_colname=\"y\",\n",
    "    frac_train=0.6,\n",
    "    frac_val=0.15,\n",
    "    frac_test=0.25,\n",
    "    random_state=None,\n",
    "):\n",
    "    if frac_train + frac_val + frac_test != 1.0:\n",
    "        raise ValueError(\n",
    "            \"fractions %f, %f, %f do not add up to 1.0\"\n",
    "            % (frac_train, frac_val, frac_test)\n",
    "        )\n",
    "\n",
    "    if stratify_colname not in df_input.columns:\n",
    "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
    "\n",
    "    X = df_input  # Contains all columns.\n",
    "    y = df_input[\n",
    "        [stratify_colname]\n",
    "    ]  # Dataframe of just the column on which to stratify.\n",
    "\n",
    "    # Split original dataframe into train and temp dataframes.\n",
    "    df_train, df_temp, y_train, y_temp = train_test_split(\n",
    "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
    "    )\n",
    "\n",
    "    # Split the temp dataframe into val and test dataframes.\n",
    "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
    "    df_val, df_test, y_val, y_test = train_test_split(\n",
    "        df_temp,\n",
    "        y_temp,\n",
    "        stratify=y_temp,\n",
    "        test_size=relative_frac_test,\n",
    "        random_state=random_state,\n",
    "    )\n",
    "\n",
    "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
    "\n",
    "    return df_train, df_val, df_test\n",
    "\n",
    "data = df[[\"rating\", \"price\", \"ram_storage\"]].copy()\n",
    "\n",
    "df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
    "    data, stratify_colname=\"rating\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
    ")\n",
    "\n",
    "print(\"Обучающая выборка: \", df_train.shape)\n",
    "\n",
    "print(\"Контрольная выборка: \", df_val.shape)\n",
    "\n",
    "print(\"Тестовая выборка: \", df_test.shape)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "kernel",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}