Наконец то все (╥﹏╥)

This commit is contained in:
Максим Яковлев 2024-10-18 18:40:10 +04:00
parent fdbdb674e8
commit d19b7e0793

View File

@ -4,13 +4,13 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### DataSet - \"Gaming Laptop Specs and Price\"\n", "### Lab2 PIbd-31 Yakovlev\n",
"Данный датасет содержит данные о игровых ноутбуках." "Загрузим три датасета"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 110,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -41,201 +41,58 @@
" 16 operating_system 502 non-null float64\n", " 16 operating_system 502 non-null float64\n",
" 17 SSD_storage 532 non-null object \n", " 17 SSD_storage 532 non-null object \n",
"dtypes: float64(3), int64(3), object(12)\n", "dtypes: float64(3), int64(3), object(12)\n",
"memory usage: 74.9+ KB\n" "memory usage: 74.9+ KB\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 8036 entries, 0 to 8035\n",
"Data columns (total 7 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Date 8036 non-null object \n",
" 1 Open 8036 non-null float64\n",
" 2 High 8036 non-null float64\n",
" 3 Low 8036 non-null float64\n",
" 4 Close 8036 non-null float64\n",
" 5 Adj Close 8036 non-null float64\n",
" 6 Volume 8036 non-null int64 \n",
"dtypes: float64(5), int64(1), object(1)\n",
"memory usage: 439.6+ KB\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 19237 entries, 0 to 19236\n",
"Data columns (total 18 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 ID 19237 non-null int64 \n",
" 1 Price 19237 non-null int64 \n",
" 2 Levy 19237 non-null object \n",
" 3 Manufacturer 19237 non-null object \n",
" 4 Model 19237 non-null object \n",
" 5 Prod. year 19237 non-null int64 \n",
" 6 Category 19237 non-null object \n",
" 7 Leather interior 19237 non-null object \n",
" 8 Fuel type 19237 non-null object \n",
" 9 Engine volume 19237 non-null object \n",
" 10 Mileage 19237 non-null object \n",
" 11 Cylinders 19237 non-null float64\n",
" 12 Gear box type 19237 non-null object \n",
" 13 Drive wheels 19237 non-null object \n",
" 14 Doors 19237 non-null object \n",
" 15 Wheel 19237 non-null object \n",
" 16 Color 19237 non-null object \n",
" 17 Airbags 19237 non-null int64 \n",
"dtypes: float64(1), int64(4), object(13)\n",
"memory usage: 2.6+ MB\n"
] ]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>brand_name</th>\n",
" <th>price</th>\n",
" <th>rating</th>\n",
" <th>processor_gen</th>\n",
" <th>processor_brand</th>\n",
" <th>processor_segment</th>\n",
" <th>CPU_mark</th>\n",
" <th>CPU_performance</th>\n",
" <th>Graphic_card_memory</th>\n",
" <th>graphic_card_name</th>\n",
" <th>graphic_card_num</th>\n",
" <th>Core</th>\n",
" <th>threads</th>\n",
" <th>display_inches</th>\n",
" <th>ram_storage</th>\n",
" <th>ram_type</th>\n",
" <th>operating_system</th>\n",
" <th>SSD_storage</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>hp</td>\n",
" <td>49490</td>\n",
" <td>70</td>\n",
" <td>5th</td>\n",
" <td>amd</td>\n",
" <td>5</td>\n",
" <td>5600H</td>\n",
" <td>maximum performance</td>\n",
" <td>4GB</td>\n",
" <td>amd radeon</td>\n",
" <td>other</td>\n",
" <td>6.0</td>\n",
" <td>12.0</td>\n",
" <td>15.6</td>\n",
" <td>8</td>\n",
" <td>DDR4</td>\n",
" <td>11.0</td>\n",
" <td>512GB SSD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>xiaomi</td>\n",
" <td>102990</td>\n",
" <td>78</td>\n",
" <td>14th</td>\n",
" <td>intel</td>\n",
" <td>i9</td>\n",
" <td>14900HX</td>\n",
" <td>maximum performance</td>\n",
" <td>8GB</td>\n",
" <td>nvidia geforce</td>\n",
" <td>4060</td>\n",
" <td>24.0</td>\n",
" <td>32.0</td>\n",
" <td>other</td>\n",
" <td>16</td>\n",
" <td>DDR5</td>\n",
" <td>11.0</td>\n",
" <td>1TB SSD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>hp</td>\n",
" <td>81490</td>\n",
" <td>73</td>\n",
" <td>7th</td>\n",
" <td>amd</td>\n",
" <td>7</td>\n",
" <td>7840HS</td>\n",
" <td>high efficiency</td>\n",
" <td>6GB</td>\n",
" <td>nvidia geforce</td>\n",
" <td>3050</td>\n",
" <td>8.0</td>\n",
" <td>16.0</td>\n",
" <td>other</td>\n",
" <td>16</td>\n",
" <td>DDR5</td>\n",
" <td>11.0</td>\n",
" <td>1TB SSD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>asus</td>\n",
" <td>49990</td>\n",
" <td>64</td>\n",
" <td>11th</td>\n",
" <td>intel</td>\n",
" <td>i5</td>\n",
" <td>11400H</td>\n",
" <td>maximum performance</td>\n",
" <td>4GB</td>\n",
" <td>nvidia geforce</td>\n",
" <td>2050</td>\n",
" <td>6.0</td>\n",
" <td>12.0</td>\n",
" <td>15.6</td>\n",
" <td>8</td>\n",
" <td>DDR4</td>\n",
" <td>11.0</td>\n",
" <td>512GB SSD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>asus</td>\n",
" <td>52990</td>\n",
" <td>66</td>\n",
" <td>11th</td>\n",
" <td>intel</td>\n",
" <td>i5</td>\n",
" <td>11400H</td>\n",
" <td>maximum performance</td>\n",
" <td>4GB</td>\n",
" <td>nvidia geforce</td>\n",
" <td>2050</td>\n",
" <td>6.0</td>\n",
" <td>12.0</td>\n",
" <td>15.6</td>\n",
" <td>16</td>\n",
" <td>DDR4</td>\n",
" <td>11.0</td>\n",
" <td>512GB SSD</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" brand_name price rating processor_gen processor_brand processor_segment \\\n",
"0 hp 49490 70 5th amd 5 \n",
"1 xiaomi 102990 78 14th intel i9 \n",
"2 hp 81490 73 7th amd 7 \n",
"3 asus 49990 64 11th intel i5 \n",
"4 asus 52990 66 11th intel i5 \n",
"\n",
" CPU_mark CPU_performance Graphic_card_memory graphic_card_name \\\n",
"0 5600H maximum performance 4GB amd radeon \n",
"1 14900HX maximum performance 8GB nvidia geforce \n",
"2 7840HS high efficiency 6GB nvidia geforce \n",
"3 11400H maximum performance 4GB nvidia geforce \n",
"4 11400H maximum performance 4GB nvidia geforce \n",
"\n",
" graphic_card_num Core threads display_inches ram_storage ram_type \\\n",
"0 other 6.0 12.0 15.6 8 DDR4 \n",
"1 4060 24.0 32.0 other 16 DDR5 \n",
"2 3050 8.0 16.0 other 16 DDR5 \n",
"3 2050 6.0 12.0 15.6 8 DDR4 \n",
"4 2050 6.0 12.0 15.6 16 DDR4 \n",
"\n",
" operating_system SSD_storage \n",
"0 11.0 512GB SSD \n",
"1 11.0 1TB SSD \n",
"2 11.0 1TB SSD \n",
"3 11.0 512GB SSD \n",
"4 11.0 512GB SSD "
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
} }
], ],
"source": [ "source": [
"import pandas as pd\n", "import pandas as pd\n",
"\n", "\n",
"df = pd.read_csv(\"datasets/laptop.csv\")\n", "df = pd.read_csv(\"datasets/laptop.csv\")\n",
"df2 = pd.read_csv(\"datasets/coffee.csv\")\n",
"df3 = pd.read_csv(\"datasets/car_price_prediction.csv\")\n",
"df.info()\n", "df.info()\n",
"df.head()" "df2.info()\n",
"df3.info()"
] ]
}, },
{ {
@ -243,56 +100,169 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"#### Проблемная область\n", "#### Проблемная область\n",
"Данный датасет позволяет проанализировать данные, и понять какие ноутбуки имеют превосходство на рынке техники, и какие чаще всего выбирают пользователи.\n", "Первый датасет позволяет проанализировать данные, и понять какие ноутбуки имеют превосходство на рынке техники, и какие чаще всего выбирают пользователи.\n",
"Второй датасет позволяет при помощи данных акций за последние 25 лет спрогнозировать будущие показатели акций кофейни Starbucks\n",
"Третий датасет позволяет проанализировать данные, и спрогнозировать категорию цены для машины, по ее комплектующим.\n",
"#### Анализ набора данных\n", "#### Анализ набора данных\n",
"Объекты наблюдения - игровые ноутбуки\n", "Объекты наблюдения - игровые ноутбуки, акции, машины\n",
"Атрибуты - Имя бренда, цена, рейтинг, поколение процессора, марка процессора, сегмент процессора, специальная метка, производительность процессора, видеокарта, видеопамять, номер видеокарты, кол-во потоков видеокарты, размер дисплея, оперативная память, тип оперативной памяти, ОС, SSD.\n", "Атрибуты - \n",
"1. Имя бренда, цена, рейтинг, поколение процессора, марка процессора, сегмент процессора, специальная метка, производительность процессора, видеокарта, видеопамять, номер видеокарты, кол-во потоков видеокарты, размер дисплея, оперативная память, тип оперативной памяти, ОС, SSD.\n",
"2. Дата, начальная цена за день, максимальная цена, минимальная цена, цена на момент закрытия продаж, скорректированная цена на момент закрытия, объем торговли акций за день.\n",
"3. Цена обслуживания, производитель, модель, год выпуска, категория, кожаный салон, тип топлива, объем двигателя.\n",
"Связи между объектами - нет\n", "Связи между объектами - нет\n",
"#### Бизнес-цели\n", "#### Бизнес-цели\n",
"Данный набор данных может помочь определить лидеров на рынке игровых ноутбуков.\n", "1. Какие модели игровых ноутбуков более выгодно продавать магазинам техники. Какие комплектующие наиболее популярны у производителей и покупателей, для дальнейшего увеличения производства данных комплектующих.\n",
"В свою очередь определение лидеров поможет определить:\n", "2. Прогноз цен для дальнейшей покупки и продажи акций. Прогнозирование для предотвращения упадка.\n",
"1. Какие модели игровых ноутбуков более выгодно продавать магазинам техники.\n", "3. Для составления списка лучших моделей автомобилей. Определения наилучшего бюджетного автомобиля, который не будет часто ломаться и приносить убытки.\n",
"2. Какие комплектующие наиболее популярны у производителей и покупателей, для дальнейшего увеличения производства данных комплектующих.\n", "#### Примеры целей технического проекта. Что поступает на вход, что является целевым признаком.\n",
"3. Определение популярных комлпектующих, для дальнейшей сборки других игровых ноутбуков новых версий.\n", "На входе всегда датасет, целевые признаки:\n",
"#### Примеры целей технического проекта. Что поступает на вход, что является целевым признаком?????????\n", "1. Рейтинг ноутбука\n",
"2. Максимальная цена за день\n",
"3. Цена обслуживания\n",
"#### Проблемы набора данных и их решения\n", "#### Проблемы набора данных и их решения\n",
"1. Возможны устаревшие данные, т.к. новые комплектующие выходят довольно часто. Для решения данной проблемы требуется удаление самых старых записей о ноутбуках, и добавление более новых моделей.\n", "1. Возможны устаревшие данные. Для решения данной проблемы требуется удаление самых старых записей, и добавление более новых.\n",
"2. Возможны выбросы, какие-то \"сверхестественные сборки\". Решить эту проблему помогает удаление таких выбросов. В маленькой выборке не рекомендуется удалять выбросы. В большой выборке выбросы усредняются.\n", "2. Возможны выбросы. Решить эту проблему помогает удаление таких выбросов. В маленькой выборке не рекомендуется удалять выбросы. В большой выборке выбросы усредняются.\n",
"#### Качество набора данных\n", "#### Качество набора данных\n",
"Набор данных содержит достаточно примеров и признаков для обучения модели. Учтены различные ситуации проблемной области. Данные соответствуют данным, которые будут\n", "Наборы данных содержат достаточно примеров и признаков для обучения модели. Учтены различные ситуации проблемной области. Данные соответствуют данным, которые будут\n",
"подаваться в производственной среде. Все метки согласованы.\n", "подаваться в производственной среде. Все метки согласованы.\n"
"#### Проблема пропущенных данных" ]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Поиск аномалий"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 111,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
" price rating Core threads ram_storage \\\n",
"count 532.000000 532.000000 530.000000 514.000000 532.000000 \n",
"mean 107684.492481 67.781955 9.035849 15.089494 15.676692 \n",
"std 80187.648965 8.161356 4.413487 5.216162 8.901257 \n",
"min 30999.000000 43.000000 2.000000 4.000000 4.000000 \n",
"25% 62371.750000 63.000000 6.000000 12.000000 8.000000 \n",
"50% 83745.000000 66.000000 8.000000 16.000000 16.000000 \n",
"75% 114040.000000 72.000000 10.000000 16.000000 16.000000 \n",
"max 599990.000000 98.000000 24.000000 32.000000 64.000000 \n",
"\n",
" operating_system \n",
"count 502.000000 \n",
"mean 10.842629 \n",
"std 0.364513 \n",
"min 10.000000 \n",
"25% 11.000000 \n",
"50% 11.000000 \n",
"75% 11.000000 \n",
"max 11.000000 \n",
" Open High Low Close Adj Close \\\n",
"count 8036.000000 8036.000000 8036.000000 8036.000000 8036.000000 \n",
"mean 30.054280 30.351487 29.751322 30.058857 26.674025 \n",
"std 33.615577 33.906613 33.314569 33.615911 31.728090 \n",
"min 0.328125 0.347656 0.320313 0.335938 0.260703 \n",
"25% 4.392031 4.531250 4.304922 4.399610 3.414300 \n",
"50% 13.325000 13.493750 13.150000 13.330000 10.352452 \n",
"75% 55.250000 55.722501 54.852499 55.267499 47.464829 \n",
"max 126.080002 126.320000 124.809998 126.059998 118.010414 \n",
"\n",
" Volume \n",
"count 8.036000e+03 \n",
"mean 1.470459e+07 \n",
"std 1.340021e+07 \n",
"min 1.504000e+06 \n",
"25% 7.817750e+06 \n",
"50% 1.169815e+07 \n",
"75% 1.778795e+07 \n",
"max 5.855088e+08 \n",
" ID Price Prod. year Cylinders Airbags\n",
"count 1.923700e+04 1.923700e+04 19237.000000 19237.000000 19237.000000\n",
"mean 4.557654e+07 1.855593e+04 2010.912824 4.582991 6.582627\n",
"std 9.365914e+05 1.905813e+05 5.668673 1.199933 4.320168\n",
"min 2.074688e+07 1.000000e+00 1939.000000 1.000000 0.000000\n",
"25% 4.569837e+07 5.331000e+03 2009.000000 4.000000 4.000000\n",
"50% 4.577231e+07 1.317200e+04 2012.000000 4.000000 6.000000\n",
"75% 4.580204e+07 2.207500e+04 2015.000000 4.000000 12.000000\n",
"max 4.581665e+07 2.630750e+07 2020.000000 16.000000 16.000000\n"
]
}
],
"source": [
"print(df.describe())\n",
"print(df2.describe())\n",
"print(df3.describe())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
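"При просмотре вывода явных аномалий в первых двух датасетах не замечено. В третьем датасете столбец Price содержит экстремальные значения (минимум 1, максимум около 26,3 млн); они отфильтровываются ниже, перед разбиением на выборки."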
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Проблема пропущенных данных"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DATASET 1\n",
"processor_gen процент пустых значений: %2.26\n", "processor_gen процент пустых значений: %2.26\n",
"processor_segment процент пустых значений: %0.75\n", "processor_segment процент пустых значений: %0.75\n",
"Graphic_card_memory процент пустых значений: %0.38\n", "Graphic_card_memory процент пустых значений: %0.38\n",
"graphic_card_name процент пустых значений: %0.38\n", "graphic_card_name процент пустых значений: %0.38\n",
"Core процент пустых значений: %0.38\n", "Core процент пустых значений: %0.38\n",
"threads процент пустых значений: %3.38\n", "threads процент пустых значений: %3.38\n",
"operating_system процент пустых значений: %5.64\n" "operating_system процент пустых значений: %5.64\n",
"DATASET 2\n",
"DATASET 3\n"
] ]
} }
], ],
"source": [ "source": [
"print(\"DATASET 1\")\n",
"for i in df.columns:\n", "for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df)*100\n", " null_rate = df[i].isnull().sum() / len(df)*100\n",
" if null_rate > 0:\n", " if null_rate > 0:\n",
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")\n",
"print(\"DATASET 2\")\n",
"for i in df2.columns:\n",
" null_rate = df2[i].isnull().sum() / len(df2)*100\n",
" if null_rate > 0:\n",
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")\n",
"print(\"DATASET 3\")\n",
"for i in df3.columns:\n",
" null_rate = df3[i].isnull().sum() / len(df3)*100\n",
" if null_rate > 0:\n",
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")" " print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"В первом датасете были поля с пустыми значениями, в остальных пустых значений не найдено."
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 113,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -501,7 +471,7 @@
"531 8 DDR4 0.0 other " "531 8 DDR4 0.0 other "
] ]
}, },
"execution_count": 10, "execution_count": 113,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -516,31 +486,31 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"#### Разбиение на выборки" "#### Разбиение на выборки\n",
"Для разбиения на выборки сначала уменьшим количество уникальных значений в столбцах с целевыми признаками, добавив новые столбцы с малым количеством уникальных значений."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 24, "execution_count": 114,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{ "source": [
"ename": "ValueError", "#У первого дата сета добавим новый столбец с рейтингом от 1 до 5 на основе столбца от 1 до 100.\n",
"evalue": "The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.", "df['new_rating'] = pd.cut(df['rating'], bins=[0,20,40,60,80,100], labels=[1,2,3,4,5], include_lowest=True)\n",
"output_type": "error", "#У второго добавим столбец с наибольшей ценой от 1 до 10, на основе столбца от 1 до 127.\n",
"traceback": [ "df2['new_high'] = pd.cut(df2['High'], bins=[0,13,26,39,52,65,78,91,104,117,130], labels=[1,2,3,4,5,6,7,8,9,10], include_lowest=True)\n",
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "#У третьего удалим слишком большие значения обслуживания и слишком маленькие и добавим новый столбец с категориями цен от 1 до 5.\n",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", "df3_filtered = df3[df3['Price'] >= 10000]\n",
"Cell \u001b[1;32mIn[24], line 46\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df_train, df_val, df_test\n\u001b[0;32m 44\u001b[0m data \u001b[38;5;241m=\u001b[39m df[[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrating\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprice\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mram_storage\u001b[39m\u001b[38;5;124m\"\u001b[39m]]\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[1;32m---> 46\u001b[0m df_train, df_val, df_test \u001b[38;5;241m=\u001b[39m \u001b[43msplit_stratified_into_train_val_test\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 47\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstratify_colname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrating\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_train\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.60\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_val\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_test\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\n\u001b[0;32m 48\u001b[0m \u001b[43m)\u001b[49m\n\u001b[0;32m 50\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_train\u001b[38;5;241m.\u001b[39mshape)\n\u001b[0;32m 52\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mКонтрольная выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_val\u001b[38;5;241m.\u001b[39mshape)\n", "df3_filtered = df3_filtered[df3_filtered['Price'] <= 100000]\n",
"Cell \u001b[1;32mIn[24], line 26\u001b[0m, in \u001b[0;36msplit_stratified_into_train_val_test\u001b[1;34m(df_input, stratify_colname, frac_train, frac_val, frac_test, random_state)\u001b[0m\n\u001b[0;32m 21\u001b[0m y \u001b[38;5;241m=\u001b[39m df_input[\n\u001b[0;32m 22\u001b[0m [stratify_colname]\n\u001b[0;32m 23\u001b[0m ] \u001b[38;5;66;03m# Dataframe of just the column on which to stratify.\u001b[39;00m\n\u001b[0;32m 25\u001b[0m \u001b[38;5;66;03m# Split original dataframe into train and temp dataframes.\u001b[39;00m\n\u001b[1;32m---> 26\u001b[0m df_train, df_temp, y_train, y_temp \u001b[38;5;241m=\u001b[39m \u001b[43mtrain_test_split\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 27\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstratify\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m1.0\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mfrac_train\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrandom_state\u001b[49m\n\u001b[0;32m 28\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 30\u001b[0m \u001b[38;5;66;03m# Split the temp dataframe into val and test dataframes.\u001b[39;00m\n\u001b[0;32m 31\u001b[0m relative_frac_test \u001b[38;5;241m=\u001b[39m frac_test \u001b[38;5;241m/\u001b[39m (frac_val \u001b[38;5;241m+\u001b[39m frac_test)\n", "df3_filtered['new_price'] = pd.cut(df3_filtered['Price'], bins=[10000,28000,46000,64000,82000,100000], labels=[1,2,3,4,5], include_lowest=True)"
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:213\u001b[0m, in \u001b[0;36mvalidate_params.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 208\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 209\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 210\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 211\u001b[0m )\n\u001b[0;32m 212\u001b[0m ):\n\u001b[1;32m--> 213\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 215\u001b[0m \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 216\u001b[0m \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m 217\u001b[0m \u001b[38;5;66;03m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m 218\u001b[0m \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m 219\u001b[0m msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[0;32m 220\u001b[0m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 221\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 222\u001b[0m \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[0;32m 223\u001b[0m )\n",
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:2806\u001b[0m, in \u001b[0;36mtrain_test_split\u001b[1;34m(test_size, train_size, random_state, shuffle, stratify, *arrays)\u001b[0m\n\u001b[0;32m 2802\u001b[0m CVClass \u001b[38;5;241m=\u001b[39m ShuffleSplit\n\u001b[0;32m 2804\u001b[0m cv \u001b[38;5;241m=\u001b[39m CVClass(test_size\u001b[38;5;241m=\u001b[39mn_test, train_size\u001b[38;5;241m=\u001b[39mn_train, random_state\u001b[38;5;241m=\u001b[39mrandom_state)\n\u001b[1;32m-> 2806\u001b[0m train, test \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcv\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43marrays\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstratify\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2808\u001b[0m train, test \u001b[38;5;241m=\u001b[39m ensure_common_namespace_device(arrays[\u001b[38;5;241m0\u001b[39m], train, test)\n\u001b[0;32m 2810\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(\n\u001b[0;32m 2811\u001b[0m chain\u001b[38;5;241m.\u001b[39mfrom_iterable(\n\u001b[0;32m 2812\u001b[0m (_safe_indexing(a, train), _safe_indexing(a, test)) \u001b[38;5;28;01mfor\u001b[39;00m a \u001b[38;5;129;01min\u001b[39;00m arrays\n\u001b[0;32m 2813\u001b[0m )\n\u001b[0;32m 2814\u001b[0m )\n",
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:1843\u001b[0m, in \u001b[0;36mBaseShuffleSplit.split\u001b[1;34m(self, X, y, groups)\u001b[0m\n\u001b[0;32m 1813\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Generate indices to split data into training and test set.\u001b[39;00m\n\u001b[0;32m 1814\u001b[0m \n\u001b[0;32m 1815\u001b[0m \u001b[38;5;124;03mParameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1840\u001b[0m \u001b[38;5;124;03mto an integer.\u001b[39;00m\n\u001b[0;32m 1841\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 1842\u001b[0m X, y, groups \u001b[38;5;241m=\u001b[39m indexable(X, y, groups)\n\u001b[1;32m-> 1843\u001b[0m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_iter_indices\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgroups\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[0;32m 1844\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01myield\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\n",
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:2252\u001b[0m, in \u001b[0;36mStratifiedShuffleSplit._iter_indices\u001b[1;34m(self, X, y, groups)\u001b[0m\n\u001b[0;32m 2250\u001b[0m class_counts \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mbincount(y_indices)\n\u001b[0;32m 2251\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m np\u001b[38;5;241m.\u001b[39mmin(class_counts) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m2\u001b[39m:\n\u001b[1;32m-> 2252\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 2253\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe least populated class in y has only 1\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2254\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m member, which is too few. The minimum\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2255\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m number of groups for any class cannot\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2256\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m be less than 2.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2257\u001b[0m )\n\u001b[0;32m 2259\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n_train \u001b[38;5;241m<\u001b[39m n_classes:\n\u001b[0;32m 2260\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 2261\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe train_size = \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m should be greater or \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2262\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mequal to the number of classes = \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m (n_train, n_classes)\n\u001b[0;32m 2263\u001b[0m )\n",
"\u001b[1;31mValueError\u001b[0m: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2."
] ]
} },
], {
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [],
"source": [ "source": [
"from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import train_test_split\n",
"\n", "\n",
@ -583,19 +553,234 @@
"\n", "\n",
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n", " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"\n", "\n",
" return df_train, df_val, df_test\n", " return df_train, df_val, df_test"
"\n", ]
"data = df[[\"rating\", \"price\", \"ram_storage\"]].copy()\n", },
"\n", {
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n", "cell_type": "markdown",
" data, stratify_colname=\"rating\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n", "metadata": {},
"source": [
"### Выборки датасетов"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"train df: (319, 19), Val df: (106, 19), Test df:(107, 19)\n",
"train df2: (4821, 8), Val df2: (1607, 8), Test df2:(1608, 8)\n",
"train df3_filtered: (6931, 19), Val df3_filtered: (2310, 19), Test df3_filtered:(2311, 19)\n"
]
}
],
"source": [
"df_train1, df_val1, df_test1 = split_stratified_into_train_val_test(\n",
" df, stratify_colname=\"new_rating\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
")\n", ")\n",
"\n", "\n",
"print(\"Обучающая выборка: \", df_train.shape)\n", "df_train2, df_val2, df_test2 = split_stratified_into_train_val_test(\n",
" df2, stratify_colname=\"new_high\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
")\n",
"\n", "\n",
"print(\"Контрольная выборка: \", df_val.shape)\n", "df_train3, df_val3, df_test3 = split_stratified_into_train_val_test(\n",
" df3_filtered, stratify_colname=\"new_price\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
")\n",
"\n", "\n",
"print(\"Тестовая выборка: \", df_test.shape)" "print(f\"train df: {df_train1.shape}, Val df: {df_val1.shape}, Test df:{df_test1.shape}\")\n",
"print(f\"train df2: {df_train2.shape}, Val df2: {df_val2.shape}, Test df2:{df_test2.shape}\")\n",
"print(f\"train df3_filtered: {df_train3.shape}, Val df3_filtered: {df_val3.shape}, Test df3_filtered:{df_test3.shape}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Каждый датасет разбит на три выборки — обучающую, контрольную и тестовую (60%, 20% и 20%) — с помощью функции train_test_split из библиотеки scikit-learn. На первый взгляд выборки получились сбалансированными.\n",
"### Приращение методами выборки с избытком (oversampling) и выборки с недостатком (undersampling)"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Выборка до oversampling и undersampling: (319, 6)\n",
"new_rating\n",
"4 249\n",
"3 44\n",
"5 26\n",
"1 0\n",
"2 0\n",
"Name: count, dtype: int64\n",
"Выборка после oversampling: (748, 6)\n",
"new_rating\n",
"5 252\n",
"4 249\n",
"3 247\n",
"1 0\n",
"2 0\n",
"Name: count, dtype: int64\n",
"Выборка после undersampling: (78, 6)\n",
"new_rating\n",
"3 26\n",
"5 26\n",
"4 26\n",
"1 0\n",
"2 0\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"from imblearn.over_sampling import ADASYN\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"\n",
"# Keep only the columns used for resampling, including the 'new_rating' label.\n",
"df_train1 = df_train1[['price', 'rating', 'threads', 'ram_storage', 'operating_system', 'new_rating']].copy()\n",
"\n",
"# Both samplers are seeded so that re-running the notebook yields the same class counts.\n",
"ada = ADASYN(random_state=42)\n",
"undersampler = RandomUnderSampler(random_state=42)\n",
"\n",
"print(\"Выборка до oversampling и undersampling:\", df_train1.shape)\n",
"print(df_train1.new_rating.value_counts())\n",
"\n",
"# NOTE(review): the whole frame, label column included, is passed as X, which is why the\n",
"# resampled frame still carries 'new_rating'; presumably the label then also takes part in\n",
"# the sampler's neighbour search - confirm this is intended.\n",
"X_resampled, y_resampled = ada.fit_resample(df_train1, df_train1['new_rating'])\n",
"df_train1_adasyn = pd.DataFrame(X_resampled)\n",
"\n",
"print(\"Выборка после oversampling: \", df_train1_adasyn.shape)\n",
"print(df_train1_adasyn.new_rating.value_counts())\n",
"\n",
"# Undersampling starts again from the original (not the oversampled) training frame.\n",
"X_resampled_under, y_resampled_under = undersampler.fit_resample(df_train1, df_train1['new_rating'])\n",
"\n",
"print(\"Выборка после undersampling: \", pd.DataFrame(X_resampled_under).shape)\n",
"print(pd.DataFrame(X_resampled_under).new_rating.value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Выборка до oversampling: (4821, 7)\n",
"new_high\n",
"1 2326\n",
"2 704\n",
"5 519\n",
"3 299\n",
"8 242\n",
"7 222\n",
"9 181\n",
"4 151\n",
"6 146\n",
"10 31\n",
"Name: count, dtype: int64\n",
"Выборка после oversampling: (22990, 7)\n",
"new_high\n",
"6 2375\n",
"2 2353\n",
"10 2327\n",
"1 2326\n",
"9 2306\n",
"8 2296\n",
"4 2293\n",
"3 2265\n",
"7 2254\n",
"5 2195\n",
"Name: count, dtype: int64\n",
"Выборка после undersampling: (310, 7)\n",
"new_high\n",
"1 31\n",
"2 31\n",
"3 31\n",
"4 31\n",
"5 31\n",
"6 31\n",
"7 31\n",
"8 31\n",
"9 31\n",
"10 31\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"# Narrow the second training split to its price/volume columns plus the 'new_high' label.\n",
"df_train2 = df_train2[['Open', 'High', 'new_high', 'Low', 'Close', 'Adj Close', 'Volume']].copy()\n",
"labels2 = df_train2['new_high']\n",
"\n",
"print(\"Выборка до oversampling и undersampling:\", df_train2.shape)\n",
"print(labels2.value_counts())\n",
"\n",
"# Oversample with the `ada` sampler defined in an earlier cell.\n",
"X_resampled, y_resampled = ada.fit_resample(df_train2, labels2)\n",
"df_train2_adasyn = pd.DataFrame(X_resampled)\n",
"\n",
"print(\"Выборка после oversampling: \", df_train2_adasyn.shape)\n",
"print(df_train2_adasyn['new_high'].value_counts())\n",
"\n",
"# Undersample the same original split with the `undersampler` defined in an earlier cell.\n",
"X_resampled_under2, y_resampled_under2 = undersampler.fit_resample(df_train2, labels2)\n",
"df_train2_under = pd.DataFrame(X_resampled_under2)\n",
"\n",
"print(\"Выборка после undersampling: \", df_train2_under.shape)\n",
"print(df_train2_under['new_high'].value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Выборка до oversampling: (6931, 5)\n",
"new_price\n",
"1 5008\n",
"2 1281\n",
"3 449\n",
"4 136\n",
"5 57\n",
"Name: count, dtype: int64\n",
"Выборка после oversampling: (25040, 5)\n",
"new_price\n",
"1 5008\n",
"2 5008\n",
"3 5008\n",
"4 5008\n",
"5 5008\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"from imblearn.over_sampling import SMOTE\n",
"\n",
"# Narrow the third training split to the selected columns plus the 'new_price' label.\n",
"df_train3 = df_train3[['Price', 'new_price','Prod. year' ,'Cylinders' ,'Airbags']].copy()\n",
"labels3 = df_train3['new_price']\n",
"\n",
"# Seeded SMOTE sampler used for oversampling this split.\n",
"smote = SMOTE(random_state=42)\n",
"\n",
"print(\"Выборка до oversampling и undersampling:\", df_train3.shape)\n",
"print(labels3.value_counts())\n",
"\n",
"X_resampled, y_resampled = smote.fit_resample(df_train3, labels3)\n",
"df_train3_smote = pd.DataFrame(X_resampled)\n",
"\n",
"print(\"Выборка после oversampling: \", df_train3_smote.shape)\n",
"print(df_train3_smote['new_price'].value_counts())\n",
"\n",
"# Undersample the same original split with the `undersampler` defined in an earlier cell.\n",
"X_resampled_under3, y_resampled_under3 = undersampler.fit_resample(df_train3, labels3)\n",
"df_train3_under = pd.DataFrame(X_resampled_under3)\n",
"\n",
"print(\"Выборка после undersampling: \", df_train3_under.shape)\n",
"print(df_train3_under['new_price'].value_counts())"
] ]
} }
], ],