{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Lab2 PIbd-31 Lobashov\n",
    "Three datasets:\n",
    "1. Car prices (variant 17)\n",
    "2. Stores (variant 9)\n",
    "3. Gold prices (variant 14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 19237 entries, 0 to 19236\n",
      "Data columns (total 18 columns):\n",
      " # Column Non-Null Count Dtype \n",
      "--- ------ -------------- ----- \n",
      " 0 ID 19237 non-null int64 \n",
      " 1 Price 19237 non-null int64 \n",
      " 2 Levy 19237 non-null int64 \n",
      " 3 Manufacturer 19237 non-null object \n",
      " 4 Model 19237 non-null object \n",
      " 5 Prod. year 19237 non-null int64 \n",
      " 6 Category 19237 non-null object \n",
      " 7 Leather interior 19237 non-null object \n",
      " 8 Fuel type 19237 non-null object \n",
      " 9 Engine volume 19237 non-null object \n",
      " 10 Mileage 19237 non-null int64 \n",
      " 11 Cylinders 19237 non-null float64\n",
      " 12 Gear box type 19237 non-null object \n",
      " 13 Drive wheels 19237 non-null object \n",
      " 14 Doors 19237 non-null object \n",
      " 15 Wheel 19237 non-null object \n",
      " 16 Color 19237 non-null object \n",
      " 17 Airbags 19237 non-null int64 \n",
      "dtypes: float64(1), int64(6), object(11)\n",
      "memory usage: 2.6+ MB\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 896 entries, 0 to 895\n",
      "Data columns (total 5 columns):\n",
      " # Column Non-Null Count Dtype\n",
      "--- ------ -------------- -----\n",
      " 0 Store ID 896 non-null int64\n",
      " 1 Store_Area 896 non-null int64\n",
      " 2 Items_Available 896 non-null int64\n",
      " 3 Daily_Customer_Count 896 non-null int64\n",
      " 4 Store_Sales 896 non-null int64\n",
      "dtypes: int64(5)\n",
      "memory usage: 35.1 KB\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1718 entries, 0 to 1717\n",
      "Data columns (total 81 columns):\n",
      " # Column Non-Null Count Dtype \n",
      "--- ------ -------------- ----- \n",
      " 0 Date 1718 non-null object \n",
      " 1 Open 1718 non-null float64\n",
      " 2 High 1718 non-null float64\n",
      " 3 Low 1718 non-null float64\n",
      " 4 Close 1718 non-null float64\n",
      " 5 Adj Close 1718 non-null float64\n",
      " 6 Volume 1718 non-null int64 \n",
      " 7 SP_open 1718 non-null float64\n",
      " 8 SP_high 1718 non-null float64\n",
      " 9 SP_low 1718 non-null float64\n",
      " 10 SP_close 1718 non-null float64\n",
      " 11 SP_Ajclose 1718 non-null float64\n",
      " 12 SP_volume 1718 non-null int64 \n",
      " 13 DJ_open 1718 non-null float64\n",
      " 14 DJ_high 1718 non-null float64\n",
      " 15 DJ_low 1718 non-null float64\n",
      " 16 DJ_close 1718 non-null float64\n",
      " 17 DJ_Ajclose 1718 non-null float64\n",
      " 18 DJ_volume 1718 non-null int64 \n",
      " 19 EG_open 1718 non-null float64\n",
      " 20 EG_high 1718 non-null float64\n",
      " 21 EG_low 1718 non-null float64\n",
      " 22 EG_close 1718 non-null float64\n",
      " 23 EG_Ajclose 1718 non-null float64\n",
      " 24 EG_volume 1718 non-null int64 \n",
      " 25 EU_Price 1718 non-null float64\n",
      " 26 EU_open 1718 non-null float64\n",
      " 27 EU_high 1718 non-null float64\n",
      " 28 EU_low 1718 non-null float64\n",
      " 29 EU_Trend 1718 non-null int64 \n",
      " 30 OF_Price 1718 non-null float64\n",
      " 31 OF_Open 1718 non-null float64\n",
      " 32 OF_High 1718 non-null float64\n",
      " 33 OF_Low 1718 non-null float64\n",
      " 34 OF_Volume 1718 non-null int64 \n",
      " 35 OF_Trend 1718 non-null int64 \n",
      " 36 OS_Price 1718 non-null float64\n",
      " 37 OS_Open 1718 non-null float64\n",
      " 38 OS_High 1718 non-null float64\n",
      " 39 OS_Low 1718 non-null float64\n",
      " 40 OS_Trend 1718 non-null int64 \n",
      " 41 SF_Price 1718 non-null int64 \n",
      " 42 SF_Open 1718 non-null int64 \n",
      " 43 SF_High 1718 non-null int64 \n",
      " 44 SF_Low 1718 non-null int64 \n",
      " 45 SF_Volume 1718 non-null int64 \n",
      " 46 SF_Trend 1718 non-null int64 \n",
      " 47 USB_Price 1718 non-null float64\n",
      " 48 USB_Open 1718 non-null float64\n",
      " 49 USB_High 1718 non-null float64\n",
      " 50 USB_Low 1718 non-null float64\n",
      " 51 USB_Trend 1718 non-null int64 \n",
      " 52 PLT_Price 1718 non-null float64\n",
      " 53 PLT_Open 1718 non-null float64\n",
      " 54 PLT_High 1718 non-null float64\n",
      " 55 PLT_Low 1718 non-null float64\n",
      " 56 PLT_Trend 1718 non-null int64 \n",
      " 57 PLD_Price 1718 non-null float64\n",
      " 58 PLD_Open 1718 non-null float64\n",
      " 59 PLD_High 1718 non-null float64\n",
      " 60 PLD_Low 1718 non-null float64\n",
      " 61 PLD_Trend 1718 non-null int64 \n",
      " 62 RHO_PRICE 1718 non-null int64 \n",
      " 63 USDI_Price 1718 non-null float64\n",
      " 64 USDI_Open 1718 non-null float64\n",
      " 65 USDI_High 1718 non-null float64\n",
      " 66 USDI_Low 1718 non-null float64\n",
      " 67 USDI_Volume 1718 non-null int64 \n",
      " 68 USDI_Trend 1718 non-null int64 \n",
      " 69 GDX_Open 1718 non-null float64\n",
      " 70 GDX_High 1718 non-null float64\n",
      " 71 GDX_Low 1718 non-null float64\n",
      " 72 GDX_Close 1718 non-null float64\n",
      " 73 GDX_Adj Close 1718 non-null float64\n",
      " 74 GDX_Volume 1718 non-null int64 \n",
      " 75 USO_Open 1718 non-null float64\n",
      " 76 USO_High 1718 non-null float64\n",
      " 77 USO_Low 1718 non-null float64\n",
      " 78 USO_Close 1718 non-null float64\n",
      " 79 USO_Adj Close 1718 non-null float64\n",
      " 80 USO_Volume 1718 non-null int64 \n",
      "dtypes: float64(58), int64(22), object(1)\n",
      "memory usage: 1.1+ MB\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"..\\\\static\\\\csv\\\\car_price_prediction.csv\")\n",
    "df2 = pd.read_csv(\"..\\\\static\\\\csv\\\\Stores.csv\")\n",
    "df3 = pd.read_csv(\"..\\\\static\\\\csv\\\\FINAL_USO.csv\")\n",
    "df.info()\n",
    "df2.info()\n",
    "df3.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Problem domain\n",
    "The first dataset lets us analyse which cars dominate the market and which ones buyers choose most often.\n",
    "The second dataset lets us use store data to analyse store performance and identify the factors that drive sales.\n",
    "The third dataset lets us analyse the data and forecast gold prices from a range of financial indicators.\n",
    "#### Dataset analysis\n",
    "Objects of observation: cars, stores, gold prices.\n",
    "Attributes:\n",
    "1. ID, Price, Levy, Manufacturer, Model, Production year, Category, Leather interior, Fuel type, Engine volume, Mileage, Cylinders, Gear box type, Drive wheels, Doors, Wheel, Color, Airbags.\n",
    "2. Store ID, Store_Area, Items_Available, Daily_Customer_Count, Store_Sales.\n",
    "3. Date, Open, High, Low, Close, Adj Close, Volume, plus the corresponding open/high/low/close/adjusted-close/volume/trend columns for SP, DJ, EG, EU, OF, OS, SF, USB, PLT, PLD, RHO, USDI, GDX and USO.\n",
    "There are no relations between the objects.\n",
    "#### Business goals\n",
    "1. Which car models are more profitable for retailers to sell, and which components are most popular with manufacturers and buyers, so that production of those components can be scaled up.\n",
    "2. Analyse store performance to identify the factors that affect sales and optimise store operation.\n",
    "3. Forecast gold prices to support investment decisions and risk management.\n",
    "#### Examples of technical project goals: what goes in, what the target feature is\n",
    "The input is always the dataset; the target features are:\n",
    "1. Car price (Price)\n",
    "2. Store sales (Store_Sales)\n",
    "3. Gold closing price (Close)\n",
    "#### Dataset problems and how to address them\n",
    "1. The data may be outdated. This is addressed by dropping the oldest records and adding newer ones.\n",
    "2. There may be outliers. Removing them helps; in a small sample removing outliers is not recommended, while in a large sample they average out. A removal sketch is given below.\n",
    "#### Dataset quality\n",
    "The datasets contain enough examples and features to train a model, cover the various situations of the problem domain, match the data that will arrive in production, and all labels are consistent.\n"
   ]
  },
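  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Below is a minimal sketch of the IQR-based outlier removal mentioned above; it is an illustration only, not part of the executed lab. The choice of the `Price` column and the conventional 1.5*IQR fences are assumptions made for the example."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch (assumptions: Price as the example column, 1.5*IQR as the fence width)\n",
    "q1, q3 = df[\"Price\"].quantile([0.25, 0.75])\n",
    "iqr = q3 - q1\n",
    "lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr\n",
    "# keep only the rows whose price falls inside the fences\n",
    "df_no_outliers = df[(df[\"Price\"] >= lower) & (df[\"Price\"] <= upper)]\n",
    "print(f\"rows before: {len(df)}, after removing Price outliers: {len(df_no_outliers)}\")"
   ]
  },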
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Anomaly search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " ID Price Levy Prod. year Mileage \\\n",
      "count 1.923700e+04 1.923700e+04 19237.000000 19237.000000 1.923700e+04 \n",
      "mean 4.557654e+07 1.855593e+04 935.018662 2010.912824 1.532236e+06 \n",
      "std 9.365914e+05 1.905813e+05 388.099990 5.668673 4.840387e+07 \n",
      "min 2.074688e+07 1.000000e+00 87.000000 1939.000000 0.000000e+00 \n",
      "25% 4.569837e+07 5.331000e+03 730.000000 2009.000000 7.013900e+04 \n",
      "50% 4.577231e+07 1.317200e+04 1000.000000 2012.000000 1.260000e+05 \n",
      "75% 4.580204e+07 2.207500e+04 1000.000000 2015.000000 1.888880e+05 \n",
      "max 4.581665e+07 2.630750e+07 11714.000000 2020.000000 2.147484e+09 \n",
      "\n",
      " Cylinders Airbags \n",
      "count 19237.000000 19237.000000 \n",
      "mean 4.582991 6.582627 \n",
      "std 1.199933 4.320168 \n",
      "min 1.000000 0.000000 \n",
      "25% 4.000000 4.000000 \n",
      "50% 4.000000 6.000000 \n",
      "75% 4.000000 12.000000 \n",
      "max 16.000000 16.000000 \n",
      " Store ID Store_Area Items_Available Daily_Customer_Count \\\n",
      "count 896.000000 896.000000 896.000000 896.000000 \n",
      "mean 448.500000 1485.409598 1782.035714 786.350446 \n",
      "std 258.797218 250.237011 299.872053 265.389281 \n",
      "min 1.000000 775.000000 932.000000 10.000000 \n",
      "25% 224.750000 1316.750000 1575.500000 600.000000 \n",
      "50% 448.500000 1477.000000 1773.500000 780.000000 \n",
      "75% 672.250000 1653.500000 1982.750000 970.000000 \n",
      "max 896.000000 2229.000000 2667.000000 1560.000000 \n",
      "\n",
      " Store_Sales \n",
      "count 896.000000 \n",
      "mean 59351.305804 \n",
      "std 17190.741895 \n",
      "min 14920.000000 \n",
      "25% 46530.000000 \n",
      "50% 58605.000000 \n",
      "75% 71872.500000 \n",
      "max 116320.000000 \n",
      " Open High Low Close Adj Close \\\n",
      "count 1718.000000 1718.000000 1718.000000 1718.000000 1718.000000 \n",
      "mean 127.323434 127.854237 126.777695 127.319482 127.319482 \n",
      "std 17.526993 17.631189 17.396513 17.536269 17.536269 \n",
      "min 100.919998 100.989998 100.230003 100.500000 100.500000 \n",
      "25% 116.220001 116.540001 115.739998 116.052502 116.052502 \n",
      "50% 121.915001 122.325001 121.369999 121.795002 121.795002 \n",
      "75% 128.427494 129.087498 127.840001 128.470001 128.470001 \n",
      "max 173.199997 174.070007 172.919998 173.610001 173.610001 \n",
      "\n",
      " Volume SP_open SP_high SP_low SP_close ... \\\n",
      "count 1.718000e+03 1718.000000 1718.000000 1718.000000 1718.000000 ... \n",
      "mean 8.446327e+06 204.490023 205.372637 203.487014 204.491222 ... \n",
      "std 4.920731e+06 43.831928 43.974644 43.618940 43.776999 ... \n",
      "min 1.501600e+06 122.059998 122.320000 120.029999 120.290001 ... \n",
      "25% 5.412925e+06 170.392498 170.962506 169.577499 170.397500 ... \n",
      "50% 7.483900e+06 205.464996 206.459999 204.430000 205.529999 ... \n",
      "75% 1.020795e+07 237.292500 237.722500 236.147503 236.889996 ... \n",
      "max 9.380420e+07 293.089996 293.940002 291.809998 293.579987 ... \n",
      "\n",
      " GDX_Low GDX_Close GDX_Adj Close GDX_Volume USO_Open \\\n",
      "count 1718.000000 1718.000000 1718.000000 1.718000e+03 1718.000000 \n",
      "mean 26.384575 26.715012 25.924624 4.356515e+07 22.113417 \n",
      "std 10.490908 10.603110 9.886570 2.909151e+07 11.431056 \n",
      "min 12.400000 12.470000 12.269618 4.729000e+06 7.820000 \n",
      "25% 20.355000 20.585000 20.180950 2.259968e+07 11.420000 \n",
      "50% 22.870001 23.054999 22.677604 3.730465e+07 16.450000 \n",
      "75% 26.797500 27.317500 26.478154 5.697055e+07 34.419998 \n",
      "max 56.770000 57.470001 54.617039 2.321536e+08 41.599998 \n",
      "\n",
      " USO_High USO_Low USO_Close USO_Adj Close USO_Volume \n",
      "count 1718.000000 1718.000000 1718.000000 1718.000000 1.718000e+03 \n",
      "mean 22.307148 21.904657 22.109051 22.109051 1.922313e+07 \n",
      "std 11.478671 11.373997 11.432787 11.432787 1.575743e+07 \n",
      "min 8.030000 7.670000 7.960000 7.960000 1.035100e+06 \n",
      "25% 11.500000 11.300000 11.392500 11.392500 6.229500e+06 \n",
      "50% 16.635001 16.040000 16.345000 16.345000 1.613015e+07 \n",
      "75% 34.667499 34.110000 34.417499 34.417499 2.672375e+07 \n",
      "max 42.299999 41.299999 42.009998 42.009998 1.102657e+08 \n",
      "\n",
      "[8 rows x 80 columns]\n"
     ]
    }
   ],
   "source": [
    "print(df.describe())\n",
    "print(df2.describe())\n",
    "print(df3.describe())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "No anomalies were noticed in the dataset columns when reviewing this output; a quick numeric cross-check is sketched below."
   ]
  },
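  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a rough numeric cross-check of the visual inspection above, the sketch below (not executed here) counts, for every numeric column of the first dataset, how many values fall outside the conventional 1.5*IQR fences; the fence width is an assumption chosen for the example."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rough cross-check: count values outside the 1.5*IQR fences per numeric column (dataset 1)\n",
    "numeric = df.select_dtypes(include=\"number\")\n",
    "q1 = numeric.quantile(0.25)\n",
    "q3 = numeric.quantile(0.75)\n",
    "iqr = q3 - q1\n",
    "outside = ((numeric < q1 - 1.5 * iqr) | (numeric > q3 + 1.5 * iqr)).sum()\n",
    "print(outside.sort_values(ascending=False))"
   ]
  },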
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Missing data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DATASET 1\n",
      "DATASET 2\n",
      "DATASET 3\n"
     ]
    }
   ],
   "source": [
    "print(\"DATASET 1\")\n",
    "for i in df.columns:\n",
    "    null_rate = df[i].isnull().sum() / len(df) * 100\n",
    "    if null_rate > 0:\n",
    "        print(f\"{i}: share of missing values {null_rate:.2f}%\")\n",
    "print(\"DATASET 2\")\n",
    "for i in df2.columns:\n",
    "    null_rate = df2[i].isnull().sum() / len(df2) * 100\n",
    "    if null_rate > 0:\n",
    "        print(f\"{i}: share of missing values {null_rate:.2f}%\")\n",
    "print(\"DATASET 3\")\n",
    "for i in df3.columns:\n",
    "    null_rate = df3[i].isnull().sum() / len(df3) * 100\n",
    "    if null_rate > 0:\n",
    "        print(f\"{i}: share of missing values {null_rate:.2f}%\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "No missing values were found in any of the datasets."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Splitting into samples\n",
    "Before splitting, we first reduce the number of unique values in the target columns by adding new columns with a small number of unique values (binned categories)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add a new column with a 1-5 rating for the first dataset\n",
    "df['new_rating'] = pd.cut(df['Price'], bins=[0, 40000, 80000, 120000, 180000, 50000000], labels=[1, 2, 3, 4, 5], include_lowest=True)\n",
    "\n",
    "# Add a new column with a 1-5 sales range for the second dataset\n",
    "df2['new_high'] = pd.cut(df2['Store_Sales'], bins=[0, 25000, 50000, 75000, 100000, 127000], labels=[1, 2, 3, 4, 5], include_lowest=True)\n",
    "\n",
    "# Filter the third dataset by price and add a 1-5 price category\n",
    "# (.copy() avoids the SettingWithCopyWarning when assigning the new column)\n",
    "df3_filtered = df3[(df3['Open'] >= 100) & (df3['Open'] <= 160)].copy()\n",
    "df3_filtered['new_price'] = pd.cut(df3_filtered['Open'], bins=[100, 120, 130, 140, 160, 180], labels=[1, 2, 3, 4, 5], include_lowest=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "def split_stratified_into_train_val_test(\n",
    "    df_input,\n",
    "    stratify_colname=\"y\",\n",
    "    frac_train=0.6,\n",
    "    frac_val=0.15,\n",
    "    frac_test=0.25,\n",
    "    random_state=None,\n",
    "):\n",
    "    if frac_train + frac_val + frac_test != 1.0:\n",
    "        raise ValueError(\n",
    "            \"fractions %f, %f, %f do not add up to 1.0\"\n",
    "            % (frac_train, frac_val, frac_test)\n",
    "        )\n",
    "\n",
    "    if stratify_colname not in df_input.columns:\n",
    "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
    "\n",
    "    X = df_input\n",
    "    y = df_input[[stratify_colname]]\n",
    "\n",
    "    df_train, df_temp, y_train, y_temp = train_test_split(\n",
    "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
    "    )\n",
    "\n",
    "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
    "    df_val, df_test, y_val, y_test = train_test_split(\n",
    "        df_temp,\n",
    "        y_temp,\n",
    "        stratify=y_temp,\n",
    "        test_size=relative_frac_test,\n",
    "        random_state=random_state,\n",
    "    )\n",
    "\n",
    "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
    "\n",
    "    return df_train, df_val, df_test"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dataset splits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DATASET 1\n",
      "Train: (11542, 19), Val: (3847, 19), Test: (3848, 19)\n",
      "DATASET 2\n",
      "Train: (537, 6), Val: (179, 6), Test: (180, 6)\n",
      "DATASET 3\n",
      "Train: (929, 82), Val: (310, 82), Test: (310, 82)\n"
     ]
    }
   ],
   "source": [
    "# Split each dataset into train/validation/test\n",
    "df_train1, df_val1, df_test1 = split_stratified_into_train_val_test(\n",
    "    df, stratify_colname=\"new_rating\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
    ")\n",
    "\n",
    "df_train2, df_val2, df_test2 = split_stratified_into_train_val_test(\n",
    "    df2, stratify_colname=\"new_high\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
    ")\n",
    "\n",
    "df_train3, df_val3, df_test3 = split_stratified_into_train_val_test(\n",
    "    df3_filtered, stratify_colname=\"new_price\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
    ")\n",
    "\n",
    "# Check the sizes of the splits\n",
    "print(\"DATASET 1\")\n",
    "print(f\"Train: {df_train1.shape}, Val: {df_val1.shape}, Test: {df_test1.shape}\")\n",
    "\n",
    "print(\"DATASET 2\")\n",
    "print(f\"Train: {df_train2.shape}, Val: {df_val2.shape}, Test: {df_test2.shape}\")\n",
    "\n",
    "print(\"DATASET 3\")\n",
    "print(f\"Train: {df_train3.shape}, Val: {df_val3.shape}, Test: {df_test3.shape}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The data was split into three samples (60%, 20% and 20%) using scikit-learn's train_test_split. At a glance the splits look balanced; a quick check is sketched below.\n",
    "### Augmentation by oversampling and undersampling"
   ]
  },
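  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A small sketch (illustrative, not executed here) to back up the balance remark above: since the split was stratified, the class shares of the stratification column should be nearly identical across the three splits of the first dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare class shares of 'new_rating' across the three splits of dataset 1\n",
    "for name, part in [(\"train\", df_train1), (\"val\", df_val1), (\"test\", df_test1)]:\n",
    "    print(name)\n",
    "    print(part[\"new_rating\"].value_counts(normalize=True).sort_index())"
   ]
  },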
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample before oversampling and undersampling (dataset 1): (11542, 6)\n",
      "new_rating\n",
      "1 10485\n",
      "2 919\n",
      "3 102\n",
      "4 27\n",
      "5 9\n",
      "Name: count, dtype: int64\n",
      "Sample after oversampling (dataset 1): (52450, 6)\n",
      "new_rating\n",
      "2 10509\n",
      "3 10490\n",
      "1 10485\n",
      "5 10484\n",
      "4 10482\n",
      "Name: count, dtype: int64\n",
      "Sample after undersampling (dataset 1): (45, 6)\n",
      "new_rating\n",
      "1 9\n",
      "2 9\n",
      "3 9\n",
      "4 9\n",
      "5 9\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "from imblearn.over_sampling import ADASYN\n",
    "from imblearn.under_sampling import RandomUnderSampler\n",
    "\n",
    "df_train1 = df_train1[['Price', 'Levy', 'Mileage', 'Prod. year', 'Mileage', 'new_rating']].copy()\n",
    "\n",
    "ada = ADASYN()\n",
    "undersampler = RandomUnderSampler(random_state=42)\n",
    "\n",
    "print(\"Sample before oversampling and undersampling (dataset 1):\", df_train1.shape)\n",
    "print(df_train1['new_rating'].value_counts())\n",
    "\n",
    "X_resampled, y_resampled = ada.fit_resample(df_train1, df_train1['new_rating'])\n",
    "df_train1_adasyn = pd.DataFrame(X_resampled)\n",
    "\n",
    "print(\"Sample after oversampling (dataset 1):\", df_train1_adasyn.shape)\n",
    "print(df_train1_adasyn.new_rating.value_counts())\n",
    "\n",
    "X_resampled_under, y_resampled_under = undersampler.fit_resample(df_train1, df_train1['new_rating'])\n",
    "\n",
    "print(\"Sample after undersampling (dataset 1):\", pd.DataFrame(X_resampled_under).shape)\n",
    "print(pd.DataFrame(X_resampled_under)['new_rating'].value_counts())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample before oversampling and undersampling: (537, 5)\n",
      "new_high\n",
      "3 253\n",
      "2 167\n",
      "4 105\n",
      "1 8\n",
      "5 4\n",
      "Name: count, dtype: int64\n",
      "Sample after oversampling: (1265, 5)\n",
      "new_high\n",
      "1 253\n",
      "2 253\n",
      "3 253\n",
      "4 253\n",
      "5 253\n",
      "Name: count, dtype: int64\n",
      "Sample after undersampling: (20, 5)\n",
      "new_high\n",
      "1 4\n",
      "2 4\n",
      "3 4\n",
      "4 4\n",
      "5 4\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "from imblearn.over_sampling import SMOTE\n",
    "\n",
    "df_train2 = df_train2[['Store_Sales', 'Store_Area', 'Items_Available', 'Daily_Customer_Count', 'new_high']].copy()\n",
    "\n",
    "smote = SMOTE(random_state=42, k_neighbors=2)\n",
    "undersampler = RandomUnderSampler(random_state=42)\n",
    "\n",
    "print(\"Sample before oversampling and undersampling:\", df_train2.shape)\n",
    "print(df_train2['new_high'].value_counts())\n",
    "\n",
    "X_resampled, y_resampled = smote.fit_resample(df_train2, df_train2['new_high'])\n",
    "df_train2_smote = pd.DataFrame(X_resampled, columns=df_train2.columns)\n",
    "\n",
    "print(\"Sample after oversampling:\", df_train2_smote.shape)\n",
    "print(df_train2_smote['new_high'].value_counts())\n",
    "\n",
    "X_resampled_under2, y_resampled_under2 = undersampler.fit_resample(df_train2, df_train2['new_high'])\n",
    "df_train2_under = pd.DataFrame(X_resampled_under2, columns=df_train2.columns)\n",
    "\n",
    "print(\"Sample after undersampling:\", df_train2_under.shape)\n",
    "print(df_train2_under['new_high'].value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample before oversampling and undersampling (dataset 3): (929, 6)\n",
      "new_price\n",
      "1 428\n",
      "2 366\n",
      "4 98\n",
      "3 37\n",
      "5 0\n",
      "Name: count, dtype: int64\n",
      "Sample after oversampling (dataset 3): (1712, 6)\n",
      "new_price\n",
      "1 428\n",
      "2 428\n",
      "3 428\n",
      "4 428\n",
      "5 0\n",
      "Name: count, dtype: int64\n",
      "Sample after undersampling (dataset 3): (148, 6)\n",
      "new_price\n",
      "1 37\n",
      "2 37\n",
      "3 37\n",
      "4 37\n",
      "5 0\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "from imblearn.over_sampling import RandomOverSampler\n",
    "\n",
    "df_train3 = df_train3[['Open', 'High', 'Low', 'Close', 'Volume', 'new_price']].copy()\n",
    "\n",
    "oversampler = RandomOverSampler(random_state=42)\n",
    "undersampler = RandomUnderSampler(random_state=42)\n",
    "\n",
    "print(\"Sample before oversampling and undersampling (dataset 3):\", df_train3.shape)\n",
    "print(df_train3['new_price'].value_counts())\n",
    "\n",
    "X_resampled, y_resampled = oversampler.fit_resample(df_train3, df_train3['new_price'])\n",
    "df_train3_oversampled = pd.DataFrame(X_resampled, columns=df_train3.columns)\n",
    "\n",
    "print(\"Sample after oversampling (dataset 3):\", df_train3_oversampled.shape)\n",
    "print(df_train3_oversampled['new_price'].value_counts())\n",
    "\n",
    "X_resampled_under, y_resampled_under = undersampler.fit_resample(df_train3, df_train3['new_price'])\n",
    "df_train3_under = pd.DataFrame(X_resampled_under, columns=df_train3.columns)\n",
    "\n",
    "print(\"Sample after undersampling (dataset 3):\", df_train3_under.shape)\n",
    "print(df_train3_under['new_price'].value_counts())\n"
   ]
  }
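  ,
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A side note on the resampling cells above: the full training frame, including the target column, is passed to `fit_resample` as the feature matrix. A more conventional pattern keeps features and target separate; the cell below is a minimal sketch of that style for the first dataset (illustrative only, not executed as part of the lab)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: keep the feature matrix and the target separate when resampling\n",
    "X1 = df_train1.drop(columns=[\"new_rating\"])\n",
    "y1 = df_train1[\"new_rating\"]\n",
    "X1_res, y1_res = RandomUnderSampler(random_state=42).fit_resample(X1, y1)\n",
    "print(y1_res.value_counts())"
   ]
  }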
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}