From 792fbe75b9fa8a02bf0c7fbcc8b7eda69bea1ef7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=A2=D0=B0=D0=B1=D0=B5=D0=B5=D0=B2=20=D0=90=D0=BB=D0=B5?=
 =?UTF-8?q?=D0=BA=D1=81=D0=B0=D0=BD=D0=B4=D1=80?=
Date: Fri, 1 Nov 2024 20:29:45 +0400
Subject: [PATCH 1/2] upd gitignore

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index aa5d4b2..b8dd8eb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -174,3 +174,6 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+lab_2/datasets_lab2/coffee.csv
+lab_2/datasets_lab2/Stores.csv
+lab_2/datasets_lab2/StudentsPerformance.csv
-- 
2.25.1


From 9b2346eabcc54f2b381b356f3c0d91df2f8a4ed8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=A2=D0=B0=D0=B1=D0=B5=D0=B5=D0=B2=20=D0=90=D0=BB=D0=B5?=
 =?UTF-8?q?=D0=BA=D1=81=D0=B0=D0=BD=D0=B4=D1=80?=
Date: Fri, 1 Nov 2024 20:30:19 +0400
Subject: [PATCH 2/2] lab 2 is done

---
 README.md              |   1 +
 lab_1/lab1.ipynb       |   2 +-
 lab_2/lab2.ipynb       | 700 +++++++++++++++++++++++++++++++++++++++++
 lab_2/requirements.txt | Bin 0 -> 1590 bytes
 4 files changed, 702 insertions(+), 1 deletion(-)
 create mode 100644 lab_2/lab2.ipynb
 create mode 100644 lab_2/requirements.txt

diff --git a/README.md b/README.md
index 066bf1d..b2de7ea 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,4 @@
 # AIM_PIbd-31_Tabeev_A.P
+# Табеев Александр Павлович
 # Вариант 3
 # https://clck.yandex.ru/redir/nWO_r1F33ck?data=NnBZTWRhdFZKOHRaTENSMFc4S0VQUGMtSXk0bDRzNnVpakFkYjNNRE5ZNFRuVk4yRGpQaHlFSHNPNVpscDY1RGxPdmF0UFlUU3V4cmpoNDBvcE5vQVAxUzRxUzFpU1YzejluV1ozdUpoS1ZMeWdjcHktYS1IT2diWFhTLWpZcVhkdzF4a25GUGRYZGtSQ3RsclBRV1RiTWJsdFlyNVFIV1MyVHp0NDJTY21Z&b64e=2&sign=0e97a68a5fb67b83ca9ea592b182bbae&keyno=17
\ No newline at end of file
diff --git a/lab_1/lab1.ipynb b/lab_1/lab1.ipynb
index c022d05..3c36ac9 100644
--- a/lab_1/lab1.ipynb
+++ b/lab_1/lab1.ipynb
@@ -11,7 +11,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
diff --git a/lab_2/lab2.ipynb b/lab_2/lab2.ipynb
new file mode 100644
index 0000000..1d34333
--- /dev/null
+++ b/lab_2/lab2.ipynb
@@ -0,0 +1,700 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Loading the datasets and printing summary information"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 8036 entries, 0 to 8035\n",
+      "Data columns (total 7 columns):\n",
+      " #   Column     Non-Null Count  Dtype  \n",
+      "---  ------     --------------  -----  \n",
+      " 0   Date       8036 non-null   object \n",
+      " 1   Open       8036 non-null   float64\n",
+      " 2   High       8036 non-null   float64\n",
+      " 3   Low        8036 non-null   float64\n",
+      " 4   Close      8036 non-null   float64\n",
+      " 5   Adj Close  8036 non-null   float64\n",
+      " 6   Volume     8036 non-null   int64  \n",
+      "dtypes: float64(5), int64(1), object(1)\n",
+      "memory usage: 439.6+ KB\n",
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 896 entries, 0 to 895\n",
+      "Data columns (total 5 columns):\n",
+      " #   Column                Non-Null Count  Dtype\n",
+      "---  ------                --------------  -----\n",
+      " 0   Store ID              896 non-null    int64\n",
+      " 1   Store_Area            896 non-null    int64\n",
+      " 2   Items_Available       896 non-null    int64\n",
+      " 3   Daily_Customer_Count  896 non-null    int64\n",
+      " 4   Store_Sales           896 non-null    int64\n",
+      "dtypes: int64(5)\n",
+      "memory usage: 35.1 KB\n",
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 1000 entries, 0 to 999\n",
+      "Data columns (total 8 columns):\n",
+      " #   Column                       Non-Null Count  Dtype \n",
+      "---  ------                       --------------  ----- \n",
+      " 0   gender                       1000 non-null   object\n",
+      " 1   race/ethnicity               1000 non-null   object\n",
+      " 2   parental level of education  1000 non-null   object\n",
+      " 3   lunch                        1000 non-null   object\n",
+      " 4   test preparation course      1000 non-null   object\n",
+      " 5   math score                   1000 non-null   int64 \n",
+      " 6   reading score                1000 non-null   int64 \n",
+      " 7   writing score                1000 non-null   int64 \n",
+      "dtypes: int64(3), object(5)\n",
+      "memory usage: 62.6+ KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# Load the three datasets used in this lab and print their schemas\n",
+    "df = pd.read_csv(\"datasets_lab2/coffee.csv\")\n",
+    "df2 = pd.read_csv(\"datasets_lab2/Stores.csv\")\n",
+    "df3 = pd.read_csv(\"datasets_lab2/StudentsPerformance.csv\")\n",
+    "df.info()\n",
+    "df2.info()\n",
+    "df3.info()"
+   ]
+  },
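+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The `Date` column is typed as `object`; as a quick sanity check (a minimal sketch of our own, not required by the lab), it can be parsed as datetime without modifying `df`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: confirm the Date column parses cleanly as datetime (non-mutating)\n",
+    "pd.to_datetime(df[\"Date\"]).describe()"
+   ]
+  },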
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Problem domain"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The first dataset, coffee.csv, tracks Starbucks stock quotes and can be used to forecast the company's future share performance. The second dataset, Stores.csv, describes retail stores. The third dataset, StudentsPerformance.csv, records students' exam performance."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Dataset analysis"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Objects:\n",
+    "1. Stock quotes\n",
+    "2. Stores\n",
+    "3. Student exam results\n",
+    "\n",
+    "Attributes:\n",
+    "1. Date; opening price of the day; daily high; daily low; closing price; adjusted closing price; daily trading volume.\n",
+    "2. Store ID; floor area; number of items available; average daily customer count; store sales (in US dollars).\n",
+    "3. Gender; race/ethnicity; parental level of education; lunch type; test preparation course; math score; reading score; writing score."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Business goals"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1. Starbucks stock analysis: forecasting future share prices to support financial decisions, avoid sharp losses, and increase profit.\n",
+    "2. Store performance evaluation: understanding the factors that make stores successful, including revenue and footfall, to inform decisions about expanding the chain or optimising the current placement strategy.\n",
+    "3. Student performance analysis: identifying the factors behind academic success in order to design student support programmes."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Technical project goals: what goes in as input, and what the target feature is"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The datasets above are the inputs; the target feature for each is:\n",
+    "1. coffee.csv: the stock's daily high price.\n",
+    "2. Stores.csv: store sales.\n",
+    "3. StudentsPerformance.csv: the overall average score (the mean of the math, reading, and writing scores)."
+   ]
+  },
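+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As an illustration (a sketch of our own; the name `overall_score` is not part of the dataset), the third target can be derived directly from the three score columns:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: the overall average score target for StudentsPerformance.csv (non-mutating)\n",
+    "overall_score = df3[[\"math score\", \"reading score\", \"writing score\"]].mean(axis=1)\n",
+    "overall_score.head()"
+   ]
+  },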
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Dataset problems and their remedies"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1. Stale data: it is important to check how current the data is, especially for the financial time series and the store records. Outdated rows can be dropped, or refreshed when newer data is available.\n",
+    "2. Outliers: anomalies such as sharp swings in stock prices or customer counts need to be identified, and a decision made on whether to remove or smooth them, taking the sample size into account."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Dataset quality"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The datasets contain enough examples and features to train a model, cover the relevant situations of the problem domain, and match the data that will arrive in production. All labels are consistent."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Searching for anomalies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "              Open         High          Low        Close    Adj Close  \\\n",
+      "count  8036.000000  8036.000000  8036.000000  8036.000000  8036.000000   \n",
+      "mean     30.054280    30.351487    29.751322    30.058857    26.674025   \n",
+      "std      33.615577    33.906613    33.314569    33.615911    31.728090   \n",
+      "min       0.328125     0.347656     0.320313     0.335938     0.260703   \n",
+      "25%       4.392031     4.531250     4.304922     4.399610     3.414300   \n",
+      "50%      13.325000    13.493750    13.150000    13.330000    10.352452   \n",
+      "75%      55.250000    55.722501    54.852499    55.267499    47.464829   \n",
+      "max     126.080002   126.320000   124.809998   126.059998   118.010414   \n",
+      "\n",
+      "             Volume  \n",
+      "count  8.036000e+03  \n",
+      "mean   1.470459e+07  \n",
+      "std    1.340021e+07  \n",
+      "min    1.504000e+06  \n",
+      "25%    7.817750e+06  \n",
+      "50%    1.169815e+07  \n",
+      "75%    1.778795e+07  \n",
+      "max    5.855088e+08  \n",
+      "         Store ID   Store_Area  Items_Available  Daily_Customer_Count  \\\n",
+      "count  896.000000   896.000000       896.000000            896.000000   \n",
+      "mean   448.500000  1485.409598      1782.035714            786.350446   \n",
+      "std    258.797218   250.237011       299.872053            265.389281   \n",
+      "min      1.000000   775.000000       932.000000             10.000000   \n",
+      "25%    224.750000  1316.750000      1575.500000            600.000000   \n",
+      "50%    448.500000  1477.000000      1773.500000            780.000000   \n",
+      "75%    672.250000  1653.500000      1982.750000            970.000000   \n",
+      "max    896.000000  2229.000000      2667.000000           1560.000000   \n",
+      "\n",
+      "         Store_Sales  \n",
+      "count     896.000000  \n",
+      "mean    59351.305804  \n",
+      "std     17190.741895  \n",
+      "min     14920.000000  \n",
+      "25%     46530.000000  \n",
+      "50%     58605.000000  \n",
+      "75%     71872.500000  \n",
+      "max    116320.000000  \n",
+      "       math score  reading score  writing score\n",
+      "count  1000.00000    1000.000000    1000.000000\n",
+      "mean     66.08900      69.169000      68.054000\n",
+      "std      15.16308      14.600192      15.195657\n",
+      "min       0.00000      17.000000      10.000000\n",
+      "25%      57.00000      59.000000      57.750000\n",
+      "50%      66.00000      70.000000      69.000000\n",
+      "75%      77.00000      79.000000      79.000000\n",
+      "max     100.00000     100.000000     100.000000\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(df.describe())\n",
+    "print(df2.describe())\n",
+    "print(df3.describe())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Inspecting the summary statistics above, no obvious anomalies stand out in any column; even the extremes (a math score of 0, a store with only 10 customers a day) are plausible values rather than data errors."
+   ]
+  },
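+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As an additional cross-check (a minimal sketch; the 1.5×IQR fence and the choice of column are our own, not required by the lab), candidate outliers can be counted explicitly:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: count values outside the 1.5*IQR fences for one column\n",
+    "q1, q3 = df2[\"Daily_Customer_Count\"].quantile([0.25, 0.75])\n",
+    "iqr = q3 - q1\n",
+    "low_fence, high_fence = q1 - 1.5 * iqr, q3 + 1.5 * iqr\n",
+    "mask = (df2[\"Daily_Customer_Count\"] < low_fence) | (df2[\"Daily_Customer_Count\"] > high_fence)\n",
+    "print(mask.sum(), \"potential outliers in Daily_Customer_Count\")"
+   ]
+  },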
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Missing data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First dataset: coffee.csv\n",
+      "Second dataset: Stores.csv\n",
+      "Third dataset: StudentsPerformance.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Report the share of missing values per column (columns with none are skipped)\n",
+    "print(\"First dataset: coffee.csv\")\n",
+    "for i in df.columns:\n",
+    "    null_rate = df[i].isnull().sum() / len(df) * 100\n",
+    "    if null_rate > 0:\n",
+    "        print(f\"{i}: {null_rate:.2f}% missing\")\n",
+    "print(\"Second dataset: Stores.csv\")\n",
+    "for i in df2.columns:\n",
+    "    null_rate = df2[i].isnull().sum() / len(df2) * 100\n",
+    "    if null_rate > 0:\n",
+    "        print(f\"{i}: {null_rate:.2f}% missing\")\n",
+    "print(\"Third dataset: StudentsPerformance.csv\")\n",
+    "for i in df3.columns:\n",
+    "    null_rate = df3[i].isnull().sum() / len(df3) * 100\n",
+    "    if null_rate > 0:\n",
+    "        print(f\"{i}: {null_rate:.2f}% missing\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "No missing values were found in any of the three datasets, so their completeness is satisfactory. The Stores dataset could, however, use a few more columns."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the first dataset, coffee.csv, we drop the rows dated before the year 2000 (strictly speaking, we keep everything after 2000-01-01; since the first trading day of 2000 is January 3, no 2000 data is lost)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "            Date       Open       High        Low      Close  Adj Close  \\\n",
+      "1899  2000-01-03   2.984375   3.085938   2.906250   3.082031   2.391797   \n",
+      "1900  2000-01-04   3.007813   3.109375   2.968750   2.984375   2.316012   \n",
+      "1901  2000-01-05   2.992188   3.078125   2.960938   3.023438   2.346326   \n",
+      "1902  2000-01-06   3.000000   3.203125   3.000000   3.132813   2.431207   \n",
+      "1903  2000-01-07   3.093750   3.125000   3.031250   3.117188   2.419082   \n",
+      "...          ...        ...        ...        ...        ...        ...   \n",
+      "8031  2024-05-17  75.269997  78.000000  74.919998  77.849998  77.849998   \n",
+      "8032  2024-05-20  77.680000  78.320000  76.709999  77.540001  77.540001   \n",
+      "8033  2024-05-21  77.559998  78.220001  77.500000  77.720001  77.720001   \n",
+      "8034  2024-05-22  77.699997  81.019997  77.440002  80.720001  80.720001   \n",
+      "8035  2024-05-23  80.099998  80.699997  79.169998  79.260002  79.260002   \n",
+      "\n",
+      "        Volume  \n",
+      "1899  24232000  \n",
+      "1900  21564800  \n",
+      "1901  28206400  \n",
+      "1902  30825600  \n",
+      "1903  26044800  \n",
+      "...        ...  \n",
+      "8031  14436500  \n",
+      "8032  11183800  \n",
+      "8033   8916600  \n",
+      "8034  22063400  \n",
+      "8035   4651418  \n",
+      "\n",
+      "[6137 rows x 7 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# ISO-formatted date strings compare correctly as plain strings\n",
+    "df_filtered = df[df['Date'] > '2000-01-01']\n",
+    "print(df_filtered)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the second dataset, Stores.csv, we overwrite every store's area with the mean value (purely as a demonstration of bulk assignment; note that this erases the column's real variation)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "     Store ID  Store_Area  Items_Available  Daily_Customer_Count  Store_Sales\n",
+      "0           1      1500.0             1961                   530        66490\n",
+      "1           2      1500.0             1752                   210        39820\n",
+      "2           3      1500.0             1609                   720        54010\n",
+      "3           4      1500.0             1748                   620        53730\n",
+      "4           5      1500.0             2111                   450        46620\n",
+      "..        ...         ...              ...                   ...          ...\n",
+      "891       892      1500.0             1910                  1080        66390\n",
+      "892       893      1500.0             1663                   850        82080\n",
+      "893       894      1500.0             1436                  1060        76440\n",
+      "894       895      1500.0             1560                   770        96610\n",
+      "895       896      1500.0             1429                  1110        54340\n",
+      "\n",
+      "[896 rows x 5 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "store_area_mean = df2['Store_Area'].mean()\n",
+    "df2['Store_Area'] = store_area_mean\n",
+    "print(df2)"
+   ]
+  },
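+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If the intent were genuine imputation rather than a demonstration, a gentler variant (a sketch of our own; it re-reads the file so the cell above is unaffected) would fill only missing entries:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: impute only missing values instead of overwriting the whole column\n",
+    "df2_fresh = pd.read_csv(\"datasets_lab2/Stores.csv\")\n",
+    "df2_fresh[\"Store_Area\"] = df2_fresh[\"Store_Area\"].fillna(df2_fresh[\"Store_Area\"].mean())\n",
+    "df2_fresh[\"Store_Area\"].describe()"
+   ]
+  },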
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the third dataset, StudentsPerformance.csv, we set every student's lunch type to 'standard'."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "     gender race/ethnicity parental level of education     lunch  \\\n",
+      "0    female        group B           bachelor's degree  standard   \n",
+      "1    female        group C                some college  standard   \n",
+      "2    female        group B             master's degree  standard   \n",
+      "3      male        group A          associate's degree  standard   \n",
+      "4      male        group C                some college  standard   \n",
+      "..      ...            ...                         ...       ...   \n",
+      "995  female        group E             master's degree  standard   \n",
+      "996    male        group C                 high school  standard   \n",
+      "997  female        group C                 high school  standard   \n",
+      "998  female        group D                some college  standard   \n",
+      "999  female        group D                some college  standard   \n",
+      "\n",
+      "    test preparation course  math score  reading score  writing score  \n",
+      "0                      none          72             72             74  \n",
+      "1                 completed          69             90             88  \n",
+      "2                      none          90             95             93  \n",
+      "3                      none          47             57             44  \n",
+      "4                      none          76             78             75  \n",
+      "..                      ...         ...            ...            ...  \n",
+      "995               completed          88             99             95  \n",
+      "996                    none          62             55             55  \n",
+      "997               completed          59             71             65  \n",
+      "998               completed          68             78             77  \n",
+      "999                    none          77             86             86  \n",
+      "\n",
+      "[1000 rows x 8 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "df3['lunch'] = 'standard'\n",
+    "print(df3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import math\n",
+    "\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "def split_stratified_into_train_val_test(\n",
+    "    df_input,\n",
+    "    stratify_colname=\"y\",\n",
+    "    frac_train=0.6,\n",
+    "    frac_val=0.15,\n",
+    "    frac_test=0.25,\n",
+    "    random_state=None,\n",
+    "):\n",
+    "    # Compare fractions with a tolerance: exact float equality is fragile\n",
+    "    if not math.isclose(frac_train + frac_val + frac_test, 1.0):\n",
+    "        raise ValueError(\n",
+    "            \"fractions %f, %f, %f do not add up to 1.0\"\n",
+    "            % (frac_train, frac_val, frac_test)\n",
+    "        )\n",
+    "\n",
+    "    if stratify_colname not in df_input.columns:\n",
+    "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
+    "\n",
+    "    X = df_input  # Contains all columns.\n",
+    "    y = df_input[\n",
+    "        [stratify_colname]\n",
+    "    ]  # Dataframe of just the column on which to stratify.\n",
+    "\n",
+    "    # Split original dataframe into train and temp dataframes.\n",
+    "    df_train, df_temp, y_train, y_temp = train_test_split(\n",
+    "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
+    "    )\n",
+    "\n",
+    "    # Split the temp dataframe into val and test dataframes.\n",
+    "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
+    "    df_val, df_test, y_val, y_test = train_test_split(\n",
+    "        df_temp,\n",
+    "        y_temp,\n",
+    "        stratify=y_temp,\n",
+    "        test_size=relative_frac_test,\n",
+    "        random_state=random_state,\n",
+    "    )\n",
+    "\n",
+    "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
+    "\n",
+    "    return df_train, df_val, df_test"
+   ]
+  },
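+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The helper above is defined but never invoked later in the notebook. As a usage sketch (stratifying on the categorical `gender` column is our own choice), it could be applied to the students dataset like this:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: stratified 60/20/20 split of StudentsPerformance on gender\n",
+    "train3, val3, test3 = split_stratified_into_train_val_test(\n",
+    "    df3,\n",
+    "    stratify_colname=\"gender\",\n",
+    "    frac_train=0.6,\n",
+    "    frac_val=0.2,\n",
+    "    frac_test=0.2,\n",
+    "    random_state=42,\n",
+    ")\n",
+    "print(train3.shape, val3.shape, test3.shape)"
+   ]
+  },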
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train df: (4821, 7), Validation df: (1607, 7), Test df: (1608, 7)\n",
+      "Train df2: (537, 5), Validation df2: (179, 5), Test df2: (180, 5)\n",
+      "Train df3: (600, 8), Validation df3: (200, 8), Test df3: (200, 8)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# A plain (non-stratified) 60/20/20 split for each dataset\n",
+    "train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)\n",
+    "val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n",
+    "\n",
+    "train_df2, temp_df2 = train_test_split(df2, test_size=0.4, random_state=42)\n",
+    "val_df2, test_df2 = train_test_split(temp_df2, test_size=0.5, random_state=42)\n",
+    "\n",
+    "train_df3, temp_df3 = train_test_split(df3, test_size=0.4, random_state=42)\n",
+    "val_df3, test_df3 = train_test_split(temp_df3, test_size=0.5, random_state=42)\n",
+    "print(f\"Train df: {train_df.shape}, Validation df: {val_df.shape}, Test df: {test_df.shape}\")\n",
+    "print(f\"Train df2: {train_df2.shape}, Validation df2: {val_df2.shape}, Test df2: {test_df2.shape}\")\n",
+    "print(f\"Train df3: {train_df3.shape}, Validation df3: {val_df3.shape}, Test df3: {test_df3.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Each dataset was split 60%/20%/20% into train, validation, and test subsets using scikit-learn's train_test_split. Judging by the shapes above, the subsets look balanced."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Augmentation with oversampling and undersampling"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "After oversampling (df):\n",
+      "High_category\n",
+      "Low       4835\n",
+      "Medium    4835\n",
+      "High      4835\n",
+      "Luxury    4835\n",
+      "Name: count, dtype: int64\n",
+      "After undersampling (df):\n",
+      "High_category\n",
+      "Low       321\n",
+      "Medium    321\n",
+      "High      321\n",
+      "Luxury    321\n",
+      "Name: count, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "from imblearn.over_sampling import RandomOverSampler\n",
+    "from imblearn.under_sampling import RandomUnderSampler\n",
+    "\n",
+    "# Bin the daily high into four price bands to get a categorical target\n",
+    "df = pd.read_csv(\"datasets_lab2/coffee.csv\")\n",
+    "df['High_category'] = pd.cut(df['High'], bins=[0.1, 20, 60, 105, float('inf')],\n",
+    "                             labels=['Low', 'Medium', 'High', 'Luxury'])\n",
+    "\n",
+    "y = df['High_category']\n",
+    "X = df.drop(columns=['High', 'High_category'])\n",
+    "\n",
+    "oversampler = RandomOverSampler(random_state=42)\n",
+    "X_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
+    "\n",
+    "undersampler = RandomUnderSampler(random_state=42)\n",
+    "X_resampled_under, y_resampled_under = undersampler.fit_resample(X, y)\n",
+    "\n",
+    "print(\"After oversampling (df):\")\n",
+    "print(pd.Series(y_resampled).value_counts())\n",
+    "\n",
+    "print(\"After undersampling (df):\")\n",
+    "print(pd.Series(y_resampled_under).value_counts())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "After oversampling (df2):\n",
+      "Sales_category\n",
+      "Low       598\n",
+      "Medium    598\n",
+      "High      598\n",
+      "Luxury      0\n",
+      "Name: count, dtype: int64\n",
+      "After undersampling (df2):\n",
+      "Sales_category\n",
+      "Low       7\n",
+      "Medium    7\n",
+      "High      7\n",
+      "Luxury    0\n",
+      "Name: count, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "df2 = pd.read_csv(\"datasets_lab2/Stores.csv\")\n",
+    "\n",
+    "# With these bins 'Luxury' (> 200000) is empty and 'High' holds only 7 stores,\n",
+    "# which is why undersampling below collapses every class to 7 rows\n",
+    "df2['Sales_category'] = pd.cut(df2['Store_Sales'], bins=[0, 50000, 100000, 200000, float('inf')],\n",
+    "                               labels=['Low', 'Medium', 'High', 'Luxury'])\n",
+    "\n",
+    "y2 = df2['Sales_category']\n",
+    "X2 = df2.drop(columns=['Store_Sales', 'Sales_category'])\n",
+    "\n",
+    "oversampler2 = RandomOverSampler(random_state=42)\n",
+    "X_resampled_2, y_resampled_2 = oversampler2.fit_resample(X2, y2)\n",
+    "\n",
+    "undersampler2 = RandomUnderSampler(random_state=42)\n",
+    "X_resampled_2_under, y_resampled_2_under = undersampler2.fit_resample(X2, y2)\n",
+    "\n",
+    "print(\"After oversampling (df2):\")\n",
+    "print(pd.Series(y_resampled_2).value_counts())\n",
+    "\n",
+    "print(\"After undersampling (df2):\")\n",
+    "print(pd.Series(y_resampled_2_under).value_counts())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "After oversampling (df3):\n",
+      "reading_score_category\n",
+      "Low       903\n",
+      "Medium    903\n",
+      "High      903\n",
+      "Luxury      0\n",
+      "Name: count, dtype: int64\n",
+      "After undersampling (df3):\n",
+      "reading_score_category\n",
+      "Low       1\n",
+      "Medium    1\n",
+      "High      1\n",
+      "Luxury    0\n",
+      "Name: count, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "df3 = pd.read_csv(\"datasets_lab2/StudentsPerformance.csv\")\n",
+    "\n",
+    "# With these bins 'Luxury' (> 100) is unreachable and 'Low' (<= 20) holds a single\n",
+    "# student, so undersampling below keeps just one row per class\n",
+    "df3['reading_score_category'] = pd.cut(df3['reading score'], bins=[0, 20, 50, 100, float('inf')],\n",
+    "                                       labels=['Low', 'Medium', 'High', 'Luxury'])\n",
+    "\n",
+    "y3 = df3['reading_score_category']\n",
+    "X3 = df3.drop(columns=['reading score', 'reading_score_category'])\n",
+    "\n",
+    "oversampler3 = RandomOverSampler(random_state=42)\n",
+    "X_resampled_3, y_resampled_3 = oversampler3.fit_resample(X3, y3)\n",
+    "\n",
+    "undersampler3 = RandomUnderSampler(random_state=42)\n",
+    "X_resampled_3_under, y_resampled_3_under = undersampler3.fit_resample(X3, y3)\n",
+    "\n",
+    "print(\"After oversampling (df3):\")\n",
+    "print(pd.Series(y_resampled_3).value_counts())\n",
+    "\n",
+    "print(\"After undersampling (df3):\")\n",
+    "print(pd.Series(y_resampled_3_under).value_counts())"
+   ]
+  },
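+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A caveat on the two runs above (our own observation from the printed counts): the 'Luxury' bins are empty and the minority classes are tiny, so undersampling shrinks the Stores data to 21 rows and the students data to 3 rows, far too few to train on. Inspecting the raw class frequencies before resampling, as sketched below, makes such degenerate binnings easy to spot."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: check raw class frequencies before resampling to catch empty or tiny bins\n",
+    "print(df2[\"Sales_category\"].value_counts(dropna=False))\n",
+    "print(df3[\"reading_score_category\"].value_counts(dropna=False))"
+   ]
+  }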
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/lab_2/requirements.txt b/lab_2/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a7ef9202560761420552151cc9fec5ee0494697b
GIT binary patch
literal 1590
zcmZvcU2@t$5QOJBRXGYNBaEFqe;&E1hoj>|+r-JyhIWD(IhSJE>S@JW$2umdej(=e-2}I(uu0`OOhv6~
zoVpWo6<-n`pIz0*Q|h3C8LPP0w-s)Ztr>C9?Fl#PS_-+*H@(b-cJE;$|8T!g4G-ye
zqraKAFrjCbdjGA))jBKj6-CB<7T-zFAsu^7^fCK=;*pwO{G_NKX(R(oV66XDO@!TM
z2fl$!@|exY{7Ql+VmN3%HqwZtXkaFL*UEw2H9=+b_O4U+N;hDkT#VB(+yN?DZ5kG^
zo$%rpr~CAvC}eZqeJ8|iPJX(mDtE`aa0Pcr-(CvWss{@s`0>cMtkdr=IYS>0>TpWg
zfOqH{1+Ts7gyOspd4s#~awBUY)$u9?uHl^7oP%$2^ihn?$uRjelUZ=ia@F$>-l4?_
zB-u`$b3I8H$>CTVoFoDMPLr|I0l$x*=|`pCs!^)JC4H^*R#`DqMRDt_u3h}Nbo#cH
z=?|F`d-wkKqMMXEm9B)TWNMlUoScK$-$o}x-k89tDiaTH&f%tU+u6cGt}r+6os~Mh
z4O~Qto$$M@%~~fMoFSvVkQSK+-lhE|Lw*lBN$dhI$ua7j