AIM-PIbd-31-LOBASHOV-I-D/lab_2/lab_2.ipynb

697 lines
33 KiB
Plaintext
Raw Permalink Normal View History

2024-10-26 01:23:18 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Lab2 PIbd-31 Lobashov\n",
"Три датасета:\n",
"1. Цена на автомобили (17 вариант)\n",
"2. Магазины (9 вариант)\n",
"3. Цены на золото (14 вариант)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 19237 entries, 0 to 19236\n",
"Data columns (total 18 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 ID 19237 non-null int64 \n",
" 1 Price 19237 non-null int64 \n",
" 2 Levy 19237 non-null int64 \n",
" 3 Manufacturer 19237 non-null object \n",
" 4 Model 19237 non-null object \n",
" 5 Prod. year 19237 non-null int64 \n",
" 6 Category 19237 non-null object \n",
" 7 Leather interior 19237 non-null object \n",
" 8 Fuel type 19237 non-null object \n",
" 9 Engine volume 19237 non-null object \n",
" 10 Mileage 19237 non-null int64 \n",
" 11 Cylinders 19237 non-null float64\n",
" 12 Gear box type 19237 non-null object \n",
" 13 Drive wheels 19237 non-null object \n",
" 14 Doors 19237 non-null object \n",
" 15 Wheel 19237 non-null object \n",
" 16 Color 19237 non-null object \n",
" 17 Airbags 19237 non-null int64 \n",
"dtypes: float64(1), int64(6), object(11)\n",
"memory usage: 2.6+ MB\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 896 entries, 0 to 895\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype\n",
"--- ------ -------------- -----\n",
" 0 Store ID 896 non-null int64\n",
" 1 Store_Area 896 non-null int64\n",
" 2 Items_Available 896 non-null int64\n",
" 3 Daily_Customer_Count 896 non-null int64\n",
" 4 Store_Sales 896 non-null int64\n",
"dtypes: int64(5)\n",
"memory usage: 35.1 KB\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1718 entries, 0 to 1717\n",
"Data columns (total 81 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Date 1718 non-null object \n",
" 1 Open 1718 non-null float64\n",
" 2 High 1718 non-null float64\n",
" 3 Low 1718 non-null float64\n",
" 4 Close 1718 non-null float64\n",
" 5 Adj Close 1718 non-null float64\n",
" 6 Volume 1718 non-null int64 \n",
" 7 SP_open 1718 non-null float64\n",
" 8 SP_high 1718 non-null float64\n",
" 9 SP_low 1718 non-null float64\n",
" 10 SP_close 1718 non-null float64\n",
" 11 SP_Ajclose 1718 non-null float64\n",
" 12 SP_volume 1718 non-null int64 \n",
" 13 DJ_open 1718 non-null float64\n",
" 14 DJ_high 1718 non-null float64\n",
" 15 DJ_low 1718 non-null float64\n",
" 16 DJ_close 1718 non-null float64\n",
" 17 DJ_Ajclose 1718 non-null float64\n",
" 18 DJ_volume 1718 non-null int64 \n",
" 19 EG_open 1718 non-null float64\n",
" 20 EG_high 1718 non-null float64\n",
" 21 EG_low 1718 non-null float64\n",
" 22 EG_close 1718 non-null float64\n",
" 23 EG_Ajclose 1718 non-null float64\n",
" 24 EG_volume 1718 non-null int64 \n",
" 25 EU_Price 1718 non-null float64\n",
" 26 EU_open 1718 non-null float64\n",
" 27 EU_high 1718 non-null float64\n",
" 28 EU_low 1718 non-null float64\n",
" 29 EU_Trend 1718 non-null int64 \n",
" 30 OF_Price 1718 non-null float64\n",
" 31 OF_Open 1718 non-null float64\n",
" 32 OF_High 1718 non-null float64\n",
" 33 OF_Low 1718 non-null float64\n",
" 34 OF_Volume 1718 non-null int64 \n",
" 35 OF_Trend 1718 non-null int64 \n",
" 36 OS_Price 1718 non-null float64\n",
" 37 OS_Open 1718 non-null float64\n",
" 38 OS_High 1718 non-null float64\n",
" 39 OS_Low 1718 non-null float64\n",
" 40 OS_Trend 1718 non-null int64 \n",
" 41 SF_Price 1718 non-null int64 \n",
" 42 SF_Open 1718 non-null int64 \n",
" 43 SF_High 1718 non-null int64 \n",
" 44 SF_Low 1718 non-null int64 \n",
" 45 SF_Volume 1718 non-null int64 \n",
" 46 SF_Trend 1718 non-null int64 \n",
" 47 USB_Price 1718 non-null float64\n",
" 48 USB_Open 1718 non-null float64\n",
" 49 USB_High 1718 non-null float64\n",
" 50 USB_Low 1718 non-null float64\n",
" 51 USB_Trend 1718 non-null int64 \n",
" 52 PLT_Price 1718 non-null float64\n",
" 53 PLT_Open 1718 non-null float64\n",
" 54 PLT_High 1718 non-null float64\n",
" 55 PLT_Low 1718 non-null float64\n",
" 56 PLT_Trend 1718 non-null int64 \n",
" 57 PLD_Price 1718 non-null float64\n",
" 58 PLD_Open 1718 non-null float64\n",
" 59 PLD_High 1718 non-null float64\n",
" 60 PLD_Low 1718 non-null float64\n",
" 61 PLD_Trend 1718 non-null int64 \n",
" 62 RHO_PRICE 1718 non-null int64 \n",
" 63 USDI_Price 1718 non-null float64\n",
" 64 USDI_Open 1718 non-null float64\n",
" 65 USDI_High 1718 non-null float64\n",
" 66 USDI_Low 1718 non-null float64\n",
" 67 USDI_Volume 1718 non-null int64 \n",
" 68 USDI_Trend 1718 non-null int64 \n",
" 69 GDX_Open 1718 non-null float64\n",
" 70 GDX_High 1718 non-null float64\n",
" 71 GDX_Low 1718 non-null float64\n",
" 72 GDX_Close 1718 non-null float64\n",
" 73 GDX_Adj Close 1718 non-null float64\n",
" 74 GDX_Volume 1718 non-null int64 \n",
" 75 USO_Open 1718 non-null float64\n",
" 76 USO_High 1718 non-null float64\n",
" 77 USO_Low 1718 non-null float64\n",
" 78 USO_Close 1718 non-null float64\n",
" 79 USO_Adj Close 1718 non-null float64\n",
" 80 USO_Volume 1718 non-null int64 \n",
"dtypes: float64(58), int64(22), object(1)\n",
"memory usage: 1.1+ MB\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\"..\\\\static\\\\csv\\\\car_price_prediction.csv\")\n",
"df2 = pd.read_csv(\"..\\\\static\\\\csv\\\\Stores.csv\")\n",
"df3 = pd.read_csv(\"..\\\\static\\\\csv\\\\FINAL_USO.csv\")\n",
"df.info()\n",
"df2.info()\n",
"df3.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Проблемная область\n",
"Первый датасет позволяет проанализировать данные и понять, какие автомобили имеют превосходство на рынке и какие чаще всего выбирают пользователи.\n",
"Второй датасет позволяет при помощи данных о магазинах проанализировать их производительность и выявить факторы, влияющие на продажи.\n",
"Третий датасет позволяет проанализировать данные и спрогнозировать цены на золото на основе различных финансовых показателей.\n",
"#### Анализ набора данных\n",
"Объекты наблюдения - автомобили, магазины, цены на золото\n",
"Атрибуты - \n",
"1. ID, Цена, Налог, Производитель, Модель, Год производства, Категория, Кожаный салон, Тип топлива, Объем двигателя, Пробег, Цилиндры, Тип коробки передач, Приводные колеса, Количество дверей, Руль, Цвет, Подушки безопасности.\n",
"2. ID магазина, Площадь магазина, Доступные товары, Ежедневное количество покупателей, Продажи магазина.\n",
"3. Дата, Открытие, Максимум, Минимум, Закрытие, Скорректированное закрытие, Объем торгов, SP_открытие, SP_максимум, SP_минимум, SP_закрытие, SP_скорректированное закрытие, SP_объем, DJ_открытие, DJ_максимум, DJ_минимум, DJ_закрытие, DJ_скорректированное закрытие, DJ_объем, EG_открытие, EG_максимум, EG_минимум, EG_закрытие, EG_скорректированное закрытие, EG_объем, EU_Цена, EU_открытие, EU_максимум, EU_минимум, EU_тренд, OF_Цена, OF_Открытие, OF_Максимум, OF_Минимум, OF_Объем, OF_Тренд, OS_Цена, OS_Открытие, OS_Максимум, OS_Минимум, OS_Тренд, SF_Цена, SF_Открытие, SF_Максимум, SF_Минимум, SF_Объем, SF_Тренд, USB_Цена, USB_Открытие, USB_Максимум, USB_Минимум, USB_Тренд, PLT_Цена, PLT_Открытие, PLT_Максимум, PLT_Минимум, PLT_Тренд, PLD_Цена, PLD_Открытие, PLD_Максимум, PLD_Минимум, PLD_Тренд, RHO_Цена, USDI_Цена, USDI_Открытие, USDI_Максимум, USDI_Минимум, USDI_Объем, USDI_Тренд, GDX_Открытие, GDX_Максимум, GDX_Минимум, GDX_Закрытие, GDX_Скорректированное закрытие, GDX_Объем, USO_Открытие, USO_Максимум, USO_Минимум, USO_Закрытие, USO_Скорректированное закрытие, USO_Объем. Связи между объектами - нет\n",
"Связи между объектами - нет\n",
"#### Бизнес-цели\n",
"1. Какие модели автомобилей более выгодно продавать магазинам техники. Какие комплектующие наиболее популярны у производителей и покупателей, для дальнейшего увеличения производства данных комплектующих.\n",
"2. Анализ производительности магазинов для выявления факторов, влияющих на продажи, и оптимизация работы магазинов.\n",
"3. Прогноз цен на золото для принятия инвестиционных решений и управления рисками.\n",
"#### Примеры целей технического проекта. Что поступает на вход, что является целевым признаком.\n",
"На входе всегда датасет, целевые признаки:\n",
"1. Цена автомобиля (Price)\n",
"2. Продажи магазина (Store_Sales)\n",
"3. Цена закрытия золота (Close)\n",
"#### Проблемы набора данных и их решения\n",
"1. Возможны устаревшие данные. Для решения данной проблемы требуется удаление самых старых записей и добавление более новых.\n",
"2. Возможны выбросы. Решить эту проблему помогает удаление таких выбросов. В маленькой выборке не рекомендуется удалять выбросы. В большой выборке выбросы усредняются.\n",
"#### Качество набора данных\n",
"Наборы данных содержат достаточно примеров и признаков для обучения модели. Учтены различные ситуации проблемной области. Данные соответствуют данным, которые будут подаваться в производственной среде. Все метки согласованы.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Поиск аномалий"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ID Price Levy Prod. year Mileage \\\n",
"count 1.923700e+04 1.923700e+04 19237.000000 19237.000000 1.923700e+04 \n",
"mean 4.557654e+07 1.855593e+04 935.018662 2010.912824 1.532236e+06 \n",
"std 9.365914e+05 1.905813e+05 388.099990 5.668673 4.840387e+07 \n",
"min 2.074688e+07 1.000000e+00 87.000000 1939.000000 0.000000e+00 \n",
"25% 4.569837e+07 5.331000e+03 730.000000 2009.000000 7.013900e+04 \n",
"50% 4.577231e+07 1.317200e+04 1000.000000 2012.000000 1.260000e+05 \n",
"75% 4.580204e+07 2.207500e+04 1000.000000 2015.000000 1.888880e+05 \n",
"max 4.581665e+07 2.630750e+07 11714.000000 2020.000000 2.147484e+09 \n",
"\n",
" Cylinders Airbags \n",
"count 19237.000000 19237.000000 \n",
"mean 4.582991 6.582627 \n",
"std 1.199933 4.320168 \n",
"min 1.000000 0.000000 \n",
"25% 4.000000 4.000000 \n",
"50% 4.000000 6.000000 \n",
"75% 4.000000 12.000000 \n",
"max 16.000000 16.000000 \n",
" Store ID Store_Area Items_Available Daily_Customer_Count \\\n",
"count 896.000000 896.000000 896.000000 896.000000 \n",
"mean 448.500000 1485.409598 1782.035714 786.350446 \n",
"std 258.797218 250.237011 299.872053 265.389281 \n",
"min 1.000000 775.000000 932.000000 10.000000 \n",
"25% 224.750000 1316.750000 1575.500000 600.000000 \n",
"50% 448.500000 1477.000000 1773.500000 780.000000 \n",
"75% 672.250000 1653.500000 1982.750000 970.000000 \n",
"max 896.000000 2229.000000 2667.000000 1560.000000 \n",
"\n",
" Store_Sales \n",
"count 896.000000 \n",
"mean 59351.305804 \n",
"std 17190.741895 \n",
"min 14920.000000 \n",
"25% 46530.000000 \n",
"50% 58605.000000 \n",
"75% 71872.500000 \n",
"max 116320.000000 \n",
" Open High Low Close Adj Close \\\n",
"count 1718.000000 1718.000000 1718.000000 1718.000000 1718.000000 \n",
"mean 127.323434 127.854237 126.777695 127.319482 127.319482 \n",
"std 17.526993 17.631189 17.396513 17.536269 17.536269 \n",
"min 100.919998 100.989998 100.230003 100.500000 100.500000 \n",
"25% 116.220001 116.540001 115.739998 116.052502 116.052502 \n",
"50% 121.915001 122.325001 121.369999 121.795002 121.795002 \n",
"75% 128.427494 129.087498 127.840001 128.470001 128.470001 \n",
"max 173.199997 174.070007 172.919998 173.610001 173.610001 \n",
"\n",
" Volume SP_open SP_high SP_low SP_close ... \\\n",
"count 1.718000e+03 1718.000000 1718.000000 1718.000000 1718.000000 ... \n",
"mean 8.446327e+06 204.490023 205.372637 203.487014 204.491222 ... \n",
"std 4.920731e+06 43.831928 43.974644 43.618940 43.776999 ... \n",
"min 1.501600e+06 122.059998 122.320000 120.029999 120.290001 ... \n",
"25% 5.412925e+06 170.392498 170.962506 169.577499 170.397500 ... \n",
"50% 7.483900e+06 205.464996 206.459999 204.430000 205.529999 ... \n",
"75% 1.020795e+07 237.292500 237.722500 236.147503 236.889996 ... \n",
"max 9.380420e+07 293.089996 293.940002 291.809998 293.579987 ... \n",
"\n",
" GDX_Low GDX_Close GDX_Adj Close GDX_Volume USO_Open \\\n",
"count 1718.000000 1718.000000 1718.000000 1.718000e+03 1718.000000 \n",
"mean 26.384575 26.715012 25.924624 4.356515e+07 22.113417 \n",
"std 10.490908 10.603110 9.886570 2.909151e+07 11.431056 \n",
"min 12.400000 12.470000 12.269618 4.729000e+06 7.820000 \n",
"25% 20.355000 20.585000 20.180950 2.259968e+07 11.420000 \n",
"50% 22.870001 23.054999 22.677604 3.730465e+07 16.450000 \n",
"75% 26.797500 27.317500 26.478154 5.697055e+07 34.419998 \n",
"max 56.770000 57.470001 54.617039 2.321536e+08 41.599998 \n",
"\n",
" USO_High USO_Low USO_Close USO_Adj Close USO_Volume \n",
"count 1718.000000 1718.000000 1718.000000 1718.000000 1.718000e+03 \n",
"mean 22.307148 21.904657 22.109051 22.109051 1.922313e+07 \n",
"std 11.478671 11.373997 11.432787 11.432787 1.575743e+07 \n",
"min 8.030000 7.670000 7.960000 7.960000 1.035100e+06 \n",
"25% 11.500000 11.300000 11.392500 11.392500 6.229500e+06 \n",
"50% 16.635001 16.040000 16.345000 16.345000 1.613015e+07 \n",
"75% 34.667499 34.110000 34.417499 34.417499 2.672375e+07 \n",
"max 42.299999 41.299999 42.009998 42.009998 1.102657e+08 \n",
"\n",
"[8 rows x 80 columns]\n"
]
}
],
"source": [
"print(df.describe())\n",
"print(df2.describe())\n",
"print(df3.describe())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"При просмотре вывода не было замечено аномалий в столбцах датасетов."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Проблема пропущенных данных"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DATASET 1\n",
"DATASET 2\n",
"DATASET 3\n"
]
}
],
"source": [
"print(\"DATASET 1\")\n",
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df)*100\n",
" if null_rate > 0:\n",
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")\n",
"print(\"DATASET 2\")\n",
"for i in df2.columns:\n",
" null_rate = df2[i].isnull().sum() / len(df2)*100\n",
" if null_rate > 0:\n",
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")\n",
"print(\"DATASET 3\")\n",
"for i in df3.columns:\n",
" null_rate = df3[i].isnull().sum() / len(df3)*100\n",
" if null_rate > 0:\n",
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Во всех датасетах пустых значений не найдено."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Разбиение на выборки\n",
"Для разбиения на выборке для начала уменьшим количество уникальных значений в столбцах с целевыми признаками, путем добавления новых столбцов с малым количеством уникальных значений."
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\goldfest\\AppData\\Local\\Temp\\ipykernel_12052\\2802807477.py:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df3_filtered['new_price'] = pd.cut(df3_filtered['Open'], bins=[100, 120, 130, 140, 160, 180], labels=[1, 2, 3, 4, 5], include_lowest=True)\n"
]
}
],
"source": [
"# Добавление нового столбца для первого датасета с рейтингом от 1 до 5\n",
"df['new_rating'] = pd.cut(df['Price'], bins=[0, 40000, 80000, 120000, 180000, 50000000], labels=[1, 2, 3, 4, 5], include_lowest=True)\n",
"\n",
"# Добавление нового столбца для второго датасета с диапазоном цен от 1 до 5\n",
"df2['new_high'] = pd.cut(df2['Store_Sales'], bins=[0, 25000, 50000, 75000, 100000, 127000], labels=[1, 2, 3, 4, 5], include_lowest=True)\n",
"\n",
"# Фильтрация третьего датасета по цене и добавление категории цен от 1 до 5\n",
"df3_filtered = df3[(df3['Open'] >= 100) & (df3['Open'] <= 160)]\n",
"df3_filtered['new_price'] = pd.cut(df3_filtered['Open'], bins=[100, 120, 130, 140, 160, 180], labels=[1, 2, 3, 4, 5], include_lowest=True)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"def split_stratified_into_train_val_test(\n",
" df_input,\n",
" stratify_colname=\"y\",\n",
" frac_train=0.6,\n",
" frac_val=0.15,\n",
" frac_test=0.25,\n",
" random_state=None,\n",
"):\n",
" if frac_train + frac_val + frac_test != 1.0:\n",
" raise ValueError(\n",
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
" % (frac_train, frac_val, frac_test)\n",
" )\n",
"\n",
" if stratify_colname not in df_input.columns:\n",
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
"\n",
" X = df_input \n",
" y = df_input[\n",
" [stratify_colname]\n",
" ] \n",
"\n",
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
" )\n",
"\n",
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
" df_val, df_test, y_val, y_test = train_test_split(\n",
" df_temp,\n",
" y_temp,\n",
" stratify=y_temp,\n",
" test_size=relative_frac_test,\n",
" random_state=random_state,\n",
" )\n",
"\n",
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"\n",
" return df_train, df_val, df_test"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Выборки датасетов"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DATASET 1\n",
"Train: (11542, 19), Val: (3847, 19), Test: (3848, 19)\n",
"DATASET 2\n",
"Train: (537, 6), Val: (179, 6), Test: (180, 6)\n",
"DATASET 3\n",
"Train: (929, 82), Val: (310, 82), Test: (310, 82)\n"
]
}
],
"source": [
"# Разбиение на выборки для каждого датасета\n",
"df_train1, df_val1, df_test1 = split_stratified_into_train_val_test(\n",
" df, stratify_colname=\"new_rating\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
")\n",
"\n",
"df_train2, df_val2, df_test2 = split_stratified_into_train_val_test(\n",
" df2, stratify_colname=\"new_high\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
")\n",
"\n",
"df_train3, df_val3, df_test3 = split_stratified_into_train_val_test(\n",
" df3_filtered, stratify_colname=\"new_price\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
")\n",
"\n",
"# Проверка размеров выборок\n",
"print(\"DATASET 1\")\n",
"print(f\"Train: {df_train1.shape}, Val: {df_val1.shape}, Test: {df_test1.shape}\")\n",
"\n",
"print(\"DATASET 2\")\n",
"print(f\"Train: {df_train2.shape}, Val: {df_val2.shape}, Test: {df_test2.shape}\")\n",
"\n",
"print(\"DATASET 3\")\n",
"print(f\"Train: {df_train3.shape}, Val: {df_val3.shape}, Test: {df_test3.shape}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Было сделано разбиение на три выборки: 60%, 20% и 20% при помощи библиотеки scikit-learn и функции train_test_split. На взгляд сбалансированные\n",
"### Приращение методами выборки с избытком (oversampling) и выборки с недостатком (undersampling)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Выборка до oversampling и undersampling (датасет 1): (11542, 6)\n",
"new_rating\n",
"1 10485\n",
"2 919\n",
"3 102\n",
"4 27\n",
"5 9\n",
"Name: count, dtype: int64\n",
"Выборка после oversampling (датасет 1): (52450, 6)\n",
"new_rating\n",
"2 10509\n",
"3 10490\n",
"1 10485\n",
"5 10484\n",
"4 10482\n",
"Name: count, dtype: int64\n",
"Выборка после undersampling (датасет 1): (45, 6)\n",
"new_rating\n",
"1 9\n",
"2 9\n",
"3 9\n",
"4 9\n",
"5 9\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"from imblearn.over_sampling import ADASYN\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"\n",
"df_train1 = df_train1[['Price', 'Levy', 'Mileage', 'Prod. year', 'Mileage', 'new_rating']].copy()\n",
"\n",
"ada = ADASYN()\n",
"undersampler = RandomUnderSampler(random_state=42)\n",
"\n",
"print(\"Выборка до oversampling и undersampling (датасет 1):\", df_train1.shape)\n",
"print(df_train1['new_rating'].value_counts())\n",
"\n",
"X_resampled, y_resampled = ada.fit_resample(df_train1, df_train1['new_rating'])\n",
"df_train1_adasyn = pd.DataFrame(X_resampled)\n",
"\n",
"print(\"Выборка после oversampling (датасет 1): \", df_train1_adasyn.shape)\n",
"print(df_train1_adasyn.new_rating.value_counts())\n",
"\n",
"X_resampled_under, y_resampled_under = undersampler.fit_resample(df_train1, df_train1['new_rating'])\n",
"\n",
"print(\"Выборка после undersampling (датасет 1): \", pd.DataFrame(X_resampled_under).shape)\n",
"print(pd.DataFrame(X_resampled_under)['new_rating'].value_counts())\n"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Выборка до oversampling и undersampling: (537, 5)\n",
"new_high\n",
"3 253\n",
"2 167\n",
"4 105\n",
"1 8\n",
"5 4\n",
"Name: count, dtype: int64\n",
"Выборка после oversampling: (1265, 5)\n",
"new_high\n",
"1 253\n",
"2 253\n",
"3 253\n",
"4 253\n",
"5 253\n",
"Name: count, dtype: int64\n",
"Выборка после undersampling: (20, 5)\n",
"new_high\n",
"1 4\n",
"2 4\n",
"3 4\n",
"4 4\n",
"5 4\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"from imblearn.over_sampling import SMOTE\n",
"df_train2 = df_train2[['Store_Sales', 'Store_Area', 'Items_Available', 'Daily_Customer_Count', 'new_high']].copy()\n",
"\n",
"smote = SMOTE(random_state=42, k_neighbors=2)\n",
"undersampler = RandomUnderSampler(random_state=42)\n",
"\n",
"print(\"Выборка до oversampling и undersampling:\", df_train2.shape)\n",
"print(df_train2['new_high'].value_counts())\n",
"\n",
"X_resampled, y_resampled = smote.fit_resample(df_train2, df_train2['new_high'])\n",
"df_train2_smote = pd.DataFrame(X_resampled, columns=df_train2.columns)\n",
"\n",
"print(\"Выборка после oversampling:\", df_train2_smote.shape)\n",
"print(df_train2_smote['new_high'].value_counts())\n",
"\n",
"X_resampled_under2, y_resampled_under2 = undersampler.fit_resample(df_train2, df_train2['new_high'])\n",
"df_train2_under = pd.DataFrame(X_resampled_under2, columns=df_train2.columns)\n",
"\n",
"print(\"Выборка после undersampling:\", df_train2_under.shape)\n",
"print(df_train2_under['new_high'].value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Выборка до oversampling и undersampling (датасет 3): (929, 6)\n",
"new_price\n",
"1 428\n",
"2 366\n",
"4 98\n",
"3 37\n",
"5 0\n",
"Name: count, dtype: int64\n",
"Выборка после oversampling (датасет 3): (1712, 6)\n",
"new_price\n",
"1 428\n",
"2 428\n",
"3 428\n",
"4 428\n",
"5 0\n",
"Name: count, dtype: int64\n",
"Выборка после undersampling (датасет 3): (148, 6)\n",
"new_price\n",
"1 37\n",
"2 37\n",
"3 37\n",
"4 37\n",
"5 0\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"from imblearn.over_sampling import RandomOverSampler\n",
"\n",
"df_train3 = df_train3[['Open', 'High', 'Low', 'Close', 'Volume', 'new_price']].copy()\n",
"\n",
"oversampler = RandomOverSampler(random_state=42)\n",
"undersampler = RandomUnderSampler(random_state=42)\n",
"\n",
"print(\"Выборка до oversampling и undersampling (датасет 3):\", df_train3.shape)\n",
"print(df_train3['new_price'].value_counts())\n",
"\n",
"X_resampled, y_resampled = oversampler.fit_resample(df_train3, df_train3['new_price'])\n",
"df_train3_oversampled = pd.DataFrame(X_resampled, columns=df_train3.columns)\n",
"\n",
"print(\"Выборка после oversampling (датасет 3):\", df_train3_oversampled.shape)\n",
"print(df_train3_oversampled['new_price'].value_counts())\n",
"\n",
"X_resampled_under, y_resampled_under = undersampler.fit_resample(df_train3, df_train3['new_price'])\n",
"df_train3_under = pd.DataFrame(X_resampled_under, columns=df_train3.columns)\n",
"\n",
"print(\"Выборка после undersampling (датасет 3):\", df_train3_under.shape)\n",
"print(df_train3_under['new_price'].value_counts())\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}