diff --git a/.gitignore b/.gitignore index 880fad8..f53ce5a 100644 --- a/.gitignore +++ b/.gitignore @@ -208,3 +208,8 @@ kernel/share/jupyter/kernels/python3/logo-64x64.png kernel/share/jupyter/kernels/python3/logo-svg.svg kernel/share/man/man1/ipython.1 kernel/share/man/man1/ttx.1 +lab_2/datasets/game_reviews.csv +lab_2/datasets/laptop.csv +lab_2/datasets/Popular_PL.csv +lab_2/datasets/coffee.csv +lab_2/datasets/car_price_prediction.csv diff --git a/lab_2/lab_2.ipynb b/lab_2/lab_2.ipynb new file mode 100644 index 0000000..dc64ebb --- /dev/null +++ b/lab_2/lab_2.ipynb @@ -0,0 +1,821 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Lab2 PIbd-31 Yakovlev\n", + "Загрузим три датасета" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 532 entries, 0 to 531\n", + "Data columns (total 18 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 brand_name 532 non-null object \n", + " 1 price 532 non-null int64 \n", + " 2 rating 532 non-null int64 \n", + " 3 processor_gen 520 non-null object \n", + " 4 processor_brand 532 non-null object \n", + " 5 processor_segment 528 non-null object \n", + " 6 CPU_mark 532 non-null object \n", + " 7 CPU_performance 532 non-null object \n", + " 8 Graphic_card_memory 530 non-null object \n", + " 9 graphic_card_name 530 non-null object \n", + " 10 graphic_card_num 532 non-null object \n", + " 11 Core 530 non-null float64\n", + " 12 threads 514 non-null float64\n", + " 13 display_inches 532 non-null object \n", + " 14 ram_storage 532 non-null int64 \n", + " 15 ram_type 532 non-null object \n", + " 16 operating_system 502 non-null float64\n", + " 17 SSD_storage 532 non-null object \n", + "dtypes: float64(3), int64(3), object(12)\n", + "memory usage: 74.9+ KB\n", + "\n", + "RangeIndex: 8036 entries, 0 to 8035\n", + "Data columns 
(total 7 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Date 8036 non-null object \n", + " 1 Open 8036 non-null float64\n", + " 2 High 8036 non-null float64\n", + " 3 Low 8036 non-null float64\n", + " 4 Close 8036 non-null float64\n", + " 5 Adj Close 8036 non-null float64\n", + " 6 Volume 8036 non-null int64 \n", + "dtypes: float64(5), int64(1), object(1)\n", + "memory usage: 439.6+ KB\n", + "\n", + "RangeIndex: 19237 entries, 0 to 19236\n", + "Data columns (total 18 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 ID 19237 non-null int64 \n", + " 1 Price 19237 non-null int64 \n", + " 2 Levy 19237 non-null object \n", + " 3 Manufacturer 19237 non-null object \n", + " 4 Model 19237 non-null object \n", + " 5 Prod. year 19237 non-null int64 \n", + " 6 Category 19237 non-null object \n", + " 7 Leather interior 19237 non-null object \n", + " 8 Fuel type 19237 non-null object \n", + " 9 Engine volume 19237 non-null object \n", + " 10 Mileage 19237 non-null object \n", + " 11 Cylinders 19237 non-null float64\n", + " 12 Gear box type 19237 non-null object \n", + " 13 Drive wheels 19237 non-null object \n", + " 14 Doors 19237 non-null object \n", + " 15 Wheel 19237 non-null object \n", + " 16 Color 19237 non-null object \n", + " 17 Airbags 19237 non-null int64 \n", + "dtypes: float64(1), int64(4), object(13)\n", + "memory usage: 2.6+ MB\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"datasets/laptop.csv\")\n", + "df2 = pd.read_csv(\"datasets/coffee.csv\")\n", + "df3 = pd.read_csv(\"datasets/car_price_prediction.csv\")\n", + "df.info()\n", + "df2.info()\n", + "df3.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Проблемная область\n", + "Первый датасет позволяет проанализировать данные и понять, какие ноутбуки имеют превосходство на рынке техники и какие чаще всего выбирают пользователи.\n", + 
"Второй датасет позволяет при помощи данных акций за последние 25 лет спрогнозировать будущие показатели акций кофейни Starbucks.\n", + "Третий датасет позволяет проанализировать данные и спрогнозировать категорию цены для машины по ее комплектующим.\n", + "#### Анализ набора данных\n", + "Объекты наблюдения - игровые ноутбуки, акции, машины\n", + "Атрибуты - \n", + "1. Имя бренда, цена, рейтинг, поколение процессора, марка процессора, сегмент процессора, специальная метка, производительность процессора, видеокарта, видеопамять, номер видеокарты, кол-во ядер, кол-во потоков, размер дисплея, оперативная память, тип оперативной памяти, ОС, SSD.\n", + "2. Дата, начальная цена за день, максимальная цена, минимальная цена, цена на момент закрытия продаж, скорректированная цена на момент закрытия, объем торговли акций за день.\n", + "3. Цена обслуживания, производитель, модель, год выпуска, категория, кожаный салон, тип топлива, объем двигателя.\n", + "Связи между объектами - нет\n", + "#### Бизнес-цели\n", + "1. Какие модели игровых ноутбуков более выгодно продавать магазинам техники. Какие комплектующие наиболее популярны у производителей и покупателей, для дальнейшего увеличения производства данных комплектующих.\n", + "2. Прогноз цен для дальнейшей покупки и продажи акций. Прогнозирование для предотвращения упадка.\n", + "3. Для составления списка лучших моделей автомобилей. Определение наилучшего бюджетного автомобиля, который не будет часто ломаться и приносить убытки.\n", + "#### Примеры целей технического проекта. Что поступает на вход, что является целевым признаком.\n", + "На входе всегда датасет, целевые признаки:\n", + "1. Рейтинг ноутбука\n", + "2. Максимальная цена за день\n", + "3. Цена обслуживания\n", + "#### Проблемы набора данных и их решения\n", + "1. Возможны устаревшие данные. Для решения данной проблемы требуется удаление самых старых записей и добавление более новых.\n", + "2. Возможны выбросы. Решить эту проблему помогает удаление таких выбросов. 
В маленькой выборке не рекомендуется удалять выбросы. В большой выборке выбросы усредняются.\n", + "#### Качество набора данных\n", + "Наборы данных содержат достаточно примеров и признаков для обучения модели. Учтены различные ситуации проблемной области. Данные соответствуют тем, которые будут\n", + "подаваться в производственной среде. Все метки согласованы.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Поиск аномалий" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " price rating Core threads ram_storage \\\n", + "count 532.000000 532.000000 530.000000 514.000000 532.000000 \n", + "mean 107684.492481 67.781955 9.035849 15.089494 15.676692 \n", + "std 80187.648965 8.161356 4.413487 5.216162 8.901257 \n", + "min 30999.000000 43.000000 2.000000 4.000000 4.000000 \n", + "25% 62371.750000 63.000000 6.000000 12.000000 8.000000 \n", + "50% 83745.000000 66.000000 8.000000 16.000000 16.000000 \n", + "75% 114040.000000 72.000000 10.000000 16.000000 16.000000 \n", + "max 599990.000000 98.000000 24.000000 32.000000 64.000000 \n", + "\n", + " operating_system \n", + "count 502.000000 \n", + "mean 10.842629 \n", + "std 0.364513 \n", + "min 10.000000 \n", + "25% 11.000000 \n", + "50% 11.000000 \n", + "75% 11.000000 \n", + "max 11.000000 \n", + " Open High Low Close Adj Close \\\n", + "count 8036.000000 8036.000000 8036.000000 8036.000000 8036.000000 \n", + "mean 30.054280 30.351487 29.751322 30.058857 26.674025 \n", + "std 33.615577 33.906613 33.314569 33.615911 31.728090 \n", + "min 0.328125 0.347656 0.320313 0.335938 0.260703 \n", + "25% 4.392031 4.531250 4.304922 4.399610 3.414300 \n", + "50% 13.325000 13.493750 13.150000 13.330000 10.352452 \n", + "75% 55.250000 55.722501 54.852499 55.267499 47.464829 \n", + "max 126.080002 126.320000 124.809998 126.059998 118.010414 \n", + "\n", + " Volume \n", + "count 8.036000e+03 \n", + 
"mean 1.470459e+07 \n", + "std 1.340021e+07 \n", + "min 1.504000e+06 \n", + "25% 7.817750e+06 \n", + "50% 1.169815e+07 \n", + "75% 1.778795e+07 \n", + "max 5.855088e+08 \n", + " ID Price Prod. year Cylinders Airbags\n", + "count 1.923700e+04 1.923700e+04 19237.000000 19237.000000 19237.000000\n", + "mean 4.557654e+07 1.855593e+04 2010.912824 4.582991 6.582627\n", + "std 9.365914e+05 1.905813e+05 5.668673 1.199933 4.320168\n", + "min 2.074688e+07 1.000000e+00 1939.000000 1.000000 0.000000\n", + "25% 4.569837e+07 5.331000e+03 2009.000000 4.000000 4.000000\n", + "50% 4.577231e+07 1.317200e+04 2012.000000 4.000000 6.000000\n", + "75% 4.580204e+07 2.207500e+04 2015.000000 4.000000 12.000000\n", + "max 4.581665e+07 2.630750e+07 2020.000000 16.000000 16.000000\n" + ] + } + ], + "source": [ + "print(df.describe())\n", + "print(df2.describe())\n", + "print(df3.describe())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "При просмотре вывода не было замечено явных аномалий в столбцах первых двух датасетов. В третьем датасете столбец Price содержит подозрительные значения (минимум 1, максимум 26 307 500 при 75%-квантиле 22 075); такие записи отфильтровываются ниже, перед разбиением на выборки." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Проблема пропущенных данных" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DATASET 1\n", + "processor_gen процент пустых значений: %2.26\n", + "processor_segment процент пустых значений: %0.75\n", + "Graphic_card_memory процент пустых значений: %0.38\n", + "graphic_card_name процент пустых значений: %0.38\n", + "Core процент пустых значений: %0.38\n", + "threads процент пустых значений: %3.38\n", + "operating_system процент пустых значений: %5.64\n", + "DATASET 2\n", + "DATASET 3\n" + ] + } + ], + "source": [ + "print(\"DATASET 1\")\n", + "for i in df.columns:\n", + " null_rate = df[i].isnull().sum() / len(df)*100\n", + " if null_rate > 0:\n", + " print(f\"{i} процент пустых значений: %{null_rate:.2f}\")\n", + "print(\"DATASET 2\")\n", + "for i in df2.columns:\n", + " null_rate = df2[i].isnull().sum() / len(df2)*100\n", + " if null_rate > 0:\n", + " print(f\"{i} процент пустых значений: %{null_rate:.2f}\")\n", + "print(\"DATASET 3\")\n", + "for i in df3.columns:\n", + " null_rate = df3[i].isnull().sum() / len(df3)*100\n", + " if null_rate > 0:\n", + " print(f\"{i} процент пустых значений: %{null_rate:.2f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В первом датасете были поля с пустыми значениями, в остальных пустых значений не найдено." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "brand_name False\n", + "price False\n", + "rating False\n", + "processor_gen False\n", + "processor_brand False\n", + "processor_segment False\n", + "CPU_mark False\n", + "CPU_performance False\n", + "Graphic_card_memory False\n", + "graphic_card_name False\n", + "graphic_card_num False\n", + "Core False\n", + "threads False\n", + "display_inches False\n", + "ram_storage False\n", + "ram_type False\n", + "operating_system False\n", + "SSD_storage False\n", + "dtype: bool\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
brand_namepriceratingprocessor_genprocessor_brandprocessor_segmentCPU_markCPU_performanceGraphic_card_memorygraphic_card_namegraphic_card_numCorethreadsdisplay_inchesram_storageram_typeoperating_systemSSD_storage
527dell75500634thamd54600Hmaximum performance6 GBamd radeonother6.012.015.68DDR410.0512 GB SSD
528lenovo1519907510thinteli710875Hmaximum performance8 GBnvidia geforceother8.016.015.616DDR410.01 TB SSD
529lenovo46500488thinteli58250Uultra-low powerIntegratedIntel Integratedother4.08.0other4DDR40.0other
530msi109990619thinteli79750Hmaximum performance6 GBnvidia geforceother6.012.0other8other0.0other
531hp95800709thinteli79750Hmaximum performance4 GBnvidia geforce16506.012.015.68DDR40.0other
\n", + "
" + ], + "text/plain": [ + " brand_name price rating processor_gen processor_brand \\\n", + "527 dell 75500 63 4th amd \n", + "528 lenovo 151990 75 10th intel \n", + "529 lenovo 46500 48 8th intel \n", + "530 msi 109990 61 9th intel \n", + "531 hp 95800 70 9th intel \n", + "\n", + " processor_segment CPU_mark CPU_performance Graphic_card_memory \\\n", + "527 5 4600H maximum performance 6 GB \n", + "528 i7 10875H maximum performance 8 GB \n", + "529 i5 8250U ultra-low power Integrated \n", + "530 i7 9750H maximum performance 6 GB \n", + "531 i7 9750H maximum performance 4 GB \n", + "\n", + " graphic_card_name graphic_card_num Core threads display_inches \\\n", + "527 amd radeon other 6.0 12.0 15.6 \n", + "528 nvidia geforce other 8.0 16.0 15.6 \n", + "529 Intel Integrated other 4.0 8.0 other \n", + "530 nvidia geforce other 6.0 12.0 other \n", + "531 nvidia geforce 1650 6.0 12.0 15.6 \n", + "\n", + " ram_storage ram_type operating_system SSD_storage \n", + "527 8 DDR4 10.0 512 GB SSD \n", + "528 16 DDR4 10.0 1 TB SSD \n", + "529 4 DDR4 0.0 other \n", + "530 8 other 0.0 other \n", + "531 8 DDR4 0.0 other " + ] + }, + "execution_count": 127, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.fillna(0) #Замена пустых значений на 0\n", + "print(df.isnull().any())\n", + "df.tail()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Разбиение на выборки\n", + "Для разбиения на выборки сначала уменьшим количество уникальных значений в столбцах с целевыми признаками путем добавления новых столбцов с малым количеством уникальных значений." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [], + "source": [ + "#У первого дата сета добавим новый столбец с рейтингом от 1 до 5 на основе столбца от 1 до 100.\n", + "\n", + "df['new_rating'] = pd.cut(df['rating'], bins=[0,20,40,60,80,100], labels=[1,2,3,4,5], include_lowest=True)\n", + "\n", + "#У второго добавим столбец с наибольшей ценой от 1 до 10, на основе столбца от 1 до 127.\n", + "\n", + "df2['new_high'] = pd.cut(df2['High'], bins=[0,13,26,39,52,65,78,91,104,117,130], labels=[1,2,3,4,5,6,7,8,9,10], include_lowest=True)\n", + "\n", + "#У третьего удалим слишком большие значения обслуживания и слишком маленькие и добавим новый столбец с категориями цен от 1 до 5.\n", + "\n", + "df3_filtered = df3[df3['Price'] >= 10000]\n", + "df3_filtered = df3_filtered[df3_filtered['Price'] <= 100000]\n", + "df3_filtered['new_price'] = pd.cut(df3_filtered['Price'], bins=[10000,28000,46000,64000,82000,100000], labels=[1,2,3,4,5], include_lowest=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "def split_stratified_into_train_val_test(\n", + " df_input,\n", + " stratify_colname=\"y\",\n", + " frac_train=0.6,\n", + " frac_val=0.15,\n", + " frac_test=0.25,\n", + " random_state=None,\n", + "):\n", + " if frac_train + frac_val + frac_test != 1.0:\n", + " raise ValueError(\n", + " \"fractions %f, %f, %f do not add up to 1.0\"\n", + " % (frac_train, frac_val, frac_test)\n", + " )\n", + "\n", + " if stratify_colname not in df_input.columns:\n", + " raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n", + "\n", + " X = df_input # Contains all columns.\n", + " y = df_input[\n", + " [stratify_colname]\n", + " ] # Dataframe of just the column on which to stratify.\n", + "\n", + " # Split original dataframe into train and temp dataframes.\n", + " df_train, df_temp, y_train, 
y_temp = train_test_split(\n", + " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n", + " )\n", + "\n", + " # Split the temp dataframe into val and test dataframes.\n", + " relative_frac_test = frac_test / (frac_val + frac_test)\n", + " df_val, df_test, y_val, y_test = train_test_split(\n", + " df_temp,\n", + " y_temp,\n", + " stratify=y_temp,\n", + " test_size=relative_frac_test,\n", + " random_state=random_state,\n", + " )\n", + "\n", + " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n", + "\n", + " return df_train, df_val, df_test" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Выборки датасетов" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "train df: (319, 19), Val df: (106, 19), Test df:(107, 19)\n", + "train df2: (4821, 8), Val df2: (1607, 8), Test df2:(1608, 8)\n", + "train df3_filtered: (6931, 19), Val df3_filtered: (2310, 19), Test df3_filtered:(2311, 19)\n" + ] + } + ], + "source": [ + "df_train1, df_val1, df_test1 = split_stratified_into_train_val_test(\n", + " df, stratify_colname=\"new_rating\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n", + ")\n", + "\n", + "df_train2, df_val2, df_test2 = split_stratified_into_train_val_test(\n", + " df2, stratify_colname=\"new_high\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n", + ")\n", + "\n", + "df_train3, df_val3, df_test3 = split_stratified_into_train_val_test(\n", + " df3_filtered, stratify_colname=\"new_price\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n", + ")\n", + "\n", + "print(f\"train df: {df_train1.shape}, Val df: {df_val1.shape}, Test df:{df_test1.shape}\")\n", + "print(f\"train df2: {df_train2.shape}, Val df2: {df_val2.shape}, Test df2:{df_test2.shape}\")\n", + "print(f\"train df3_filtered: {df_train3.shape}, Val df3_filtered: {df_val3.shape}, Test df3_filtered:{df_test3.shape}\")\n" + ] + }, 
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Было сделано разбиение на три выборки: 60%, 20% и 20% при помощи библиотеки scikit-learn и функции train_test_split. На взгляд сбалансированные\n", + "### Приращение методами выборки с избытком (oversampling) и выборки с недостатком (undersampling)" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Выборка до oversampling и undersampling: (319, 6)\n", + "new_rating\n", + "4 249\n", + "3 44\n", + "5 26\n", + "1 0\n", + "2 0\n", + "Name: count, dtype: int64\n", + "Выборка после oversampling: (750, 6)\n", + "new_rating\n", + "5 251\n", + "3 250\n", + "4 249\n", + "1 0\n", + "2 0\n", + "Name: count, dtype: int64\n", + "Выборка после undersampling: (78, 6)\n", + "new_rating\n", + "3 26\n", + "5 26\n", + "4 26\n", + "1 0\n", + "2 0\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "from imblearn.over_sampling import ADASYN\n", + "from imblearn.under_sampling import RandomUnderSampler\n", + "\n", + "df_train1 = df_train1[['price', 'rating', 'threads', 'ram_storage', 'operating_system', 'new_rating']].copy()\n", + "\n", + "ada = ADASYN()\n", + "undersampler = RandomUnderSampler(random_state=42)\n", + "\n", + "print(\"Выборка до oversampling и undersampling:\", df_train1.shape)\n", + "print(df_train1.new_rating.value_counts())\n", + "\n", + "X_resampled, y_resampled = ada.fit_resample(df_train1, df_train1['new_rating'])\n", + "df_train1_adasyn = pd.DataFrame(X_resampled)\n", + "\n", + "print(\"Выборка после oversampling: \", df_train1_adasyn.shape)\n", + "print(df_train1_adasyn.new_rating.value_counts())\n", + "\n", + "X_resampled_under, y_resampled_under = undersampler.fit_resample(df_train1, df_train1['new_rating'])\n", + "\n", + "print(\"Выборка после undersampling: \", pd.DataFrame(X_resampled_under).shape)\n", + "print(pd.DataFrame(X_resampled_under).new_rating.value_counts())" + 
] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Выборка до oversampling и undersampling: (4821, 7)\n", + "new_high\n", + "1 2326\n", + "2 704\n", + "5 519\n", + "3 299\n", + "8 242\n", + "7 222\n", + "9 181\n", + "4 151\n", + "6 146\n", + "10 31\n", + "Name: count, dtype: int64\n", + "Выборка после oversampling: (23144, 7)\n", + "new_high\n", + "8 2374\n", + "6 2368\n", + "2 2351\n", + "4 2335\n", + "1 2326\n", + "9 2317\n", + "10 2312\n", + "5 2256\n", + "7 2256\n", + "3 2249\n", + "Name: count, dtype: int64\n", + "Выборка после undersampling: (310, 7)\n", + "new_high\n", + "1 31\n", + "2 31\n", + "3 31\n", + "4 31\n", + "5 31\n", + "6 31\n", + "7 31\n", + "8 31\n", + "9 31\n", + "10 31\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "df_train2 = df_train2[['Open', 'High', 'new_high', 'Low', 'Close', 'Adj Close', 'Volume']].copy()\n", + "\n", + "print(\"Выборка до oversampling и undersampling:\", df_train2.shape)\n", + "print(df_train2.new_high.value_counts())\n", + "\n", + "X_resampled, y_resampled = ada.fit_resample(df_train2, df_train2['new_high'])\n", + "df_train2_adasyn = pd.DataFrame(X_resampled)\n", + "\n", + "print(\"Выборка после oversampling: \", df_train2_adasyn.shape)\n", + "print(df_train2_adasyn.new_high.value_counts())\n", + "\n", + "X_resampled_under2, y_resampled_under2 = undersampler.fit_resample(df_train2, df_train2['new_high'])\n", + "\n", + "print(\"Выборка после undersampling: \", pd.DataFrame(X_resampled_under2).shape)\n", + "print(pd.DataFrame(X_resampled_under2).new_high.value_counts())" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Выборка до oversampling и undersampling: (6931, 5)\n", + "new_price\n", + "1 5008\n", + "2 1281\n", + "3 449\n", + "4 136\n", + "5 57\n", + "Name: count, dtype: int64\n", 
+ "Выборка после oversampling: (25040, 5)\n", + "new_price\n", + "1 5008\n", + "2 5008\n", + "3 5008\n", + "4 5008\n", + "5 5008\n", + "Name: count, dtype: int64\n", + "Выборка после undersampling: (285, 5)\n", + "new_price\n", + "1 57\n", + "2 57\n", + "3 57\n", + "4 57\n", + "5 57\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "from imblearn.over_sampling import SMOTE\n", + "\n", + "df_train3 = df_train3[['Price', 'new_price','Prod. year' ,'Cylinders' ,'Airbags']].copy()\n", + "\n", + "smote = SMOTE(random_state=42)\n", + "\n", + "print(\"Выборка до oversampling и undersampling:\", df_train3.shape)\n", + "print(df_train3.new_price.value_counts())\n", + "\n", + "X_resampled, y_resampled = smote.fit_resample(df_train3, df_train3['new_price'])\n", + "df_train3_smote = pd.DataFrame(X_resampled)\n", + "\n", + "print(\"Выборка после oversampling: \", df_train3_smote.shape)\n", + "print(df_train3_smote.new_price.value_counts())\n", + "\n", + "X_resampled_under3, y_resampled_under3 = undersampler.fit_resample(df_train3, df_train3['new_price'])\n", + "\n", + "print(\"Выборка после undersampling: \", pd.DataFrame(X_resampled_under3).shape)\n", + "print(pd.DataFrame(X_resampled_under3).new_price.value_counts())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "kernel", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/lab_2/requirements.txt b/lab_2/requirements.txt new file mode 100644 index 0000000..f17ad4b Binary files /dev/null and b/lab_2/requirements.txt differ