From 792fbe75b9fa8a02bf0c7fbcc8b7eda69bea1ef7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=A2=D0=B0=D0=B1=D0=B5=D0=B5=D0=B2=20=D0=90=D0=BB=D0=B5?=
 =?UTF-8?q?=D0=BA=D1=81=D0=B0=D0=BD=D0=B4=D1=80?=
Date: Fri, 1 Nov 2024 20:29:45 +0400
Subject: [PATCH 1/2] upd gitignore

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index aa5d4b2..b8dd8eb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -174,3 +174,6 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+lab_2/datasets_lab2/coffee.csv
+lab_2/datasets_lab2/Stores.csv
+lab_2/datasets_lab2/StudentsPerformance.csv
-- 
2.25.1


From 9b2346eabcc54f2b381b356f3c0d91df2f8a4ed8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=A2=D0=B0=D0=B1=D0=B5=D0=B5=D0=B2=20=D0=90=D0=BB=D0=B5?=
 =?UTF-8?q?=D0=BA=D1=81=D0=B0=D0=BD=D0=B4=D1=80?=
Date: Fri, 1 Nov 2024 20:30:19 +0400
Subject: [PATCH 2/2] lab 2 is done

---
 README.md              |   1 +
 lab_1/lab1.ipynb       |   2 +-
 lab_2/lab2.ipynb       | 700 +++++++++++++++++++++++++++++++++++++++++
 lab_2/requirements.txt | Bin 0 -> 1590 bytes
 4 files changed, 702 insertions(+), 1 deletion(-)
 create mode 100644 lab_2/lab2.ipynb
 create mode 100644 lab_2/requirements.txt

diff --git a/README.md b/README.md
index 066bf1d..b2de7ea 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,4 @@
 # AIM_PIbd-31_Tabeev_A.P
+# Табеев Александр Павлович
 # Вариант 3
 # https://clck.yandex.ru/redir/nWO_r1F33ck?data=NnBZTWRhdFZKOHRaTENSMFc4S0VQUGMtSXk0bDRzNnVpakFkYjNNRE5ZNFRuVk4yRGpQaHlFSHNPNVpscDY1RGxPdmF0UFlUU3V4cmpoNDBvcE5vQVAxUzRxUzFpU1YzejluV1ozdUpoS1ZMeWdjcHktYS1IT2diWFhTLWpZcVhkdzF4a25GUGRYZGtSQ3RsclBRV1RiTWJsdFlyNVFIV1MyVHp0NDJTY21Z&b64e=2&sign=0e97a68a5fb67b83ca9ea592b182bbae&keyno=17
\ No newline at end of file
diff --git a/lab_1/lab1.ipynb b/lab_1/lab1.ipynb
index c022d05..3c36ac9 100644
--- a/lab_1/lab1.ipynb
+++ b/lab_1/lab1.ipynb
@@ -11,7 +11,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
diff --git a/lab_2/lab2.ipynb b/lab_2/lab2.ipynb
new file mode 100644
index 0000000..1d34333
--- /dev/null
+++ b/lab_2/lab2.ipynb
@@ -0,0 +1,700 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Loading the datasets and printing summary information"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 8036 entries, 0 to 8035\n",
+      "Data columns (total 7 columns):\n",
+      " #   Column     Non-Null Count  Dtype  \n",
+      "---  ------     --------------  -----  \n",
+      " 0   Date       8036 non-null   object \n",
+      " 1   Open       8036 non-null   float64\n",
+      " 2   High       8036 non-null   float64\n",
+      " 3   Low        8036 non-null   float64\n",
+      " 4   Close      8036 non-null   float64\n",
+      " 5   Adj Close  8036 non-null   float64\n",
+      " 6   Volume     8036 non-null   int64  \n",
+      "dtypes: float64(5), int64(1), object(1)\n",
+      "memory usage: 439.6+ KB\n",
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 896 entries, 0 to 895\n",
+      "Data columns (total 5 columns):\n",
+      " #   Column                Non-Null Count  Dtype\n",
+      "---  ------                --------------  -----\n",
+      " 0   Store ID              896 non-null    int64\n",
+      " 1   Store_Area            896 non-null    int64\n",
+      " 2   Items_Available       896 non-null    int64\n",
+      " 3   Daily_Customer_Count  896 non-null    int64\n",
+      " 4   Store_Sales           896 non-null    int64\n",
+      "dtypes: int64(5)\n",
+      "memory usage: 35.1 KB\n",
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 1000 entries, 0 to 999\n",
+      "Data columns (total 8 columns):\n",
+      " #   Column                       Non-Null Count  Dtype \n",
+      "---  ------                       --------------  ----- \n",
+      " 0   gender                       1000 non-null   object\n",
+      " 1   race/ethnicity               1000 non-null   object\n",
+      " 2   parental level of education  1000 non-null   object\n",
+      " 3   lunch                        1000 non-null   object\n",
+      " 4   test preparation course      1000 non-null   object\n",
+      " 5   math score                   1000 non-null   int64 \n",
+      " 6   reading score                1000 non-null   int64 \n",
+      " 7   writing score                1000 non-null   int64 \n",
+      "dtypes: int64(3), object(5)\n",
+      "memory usage: 62.6+ KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# Load the three datasets used in this lab and print their schemas\n",
+    "df = pd.read_csv(\"datasets_lab2/coffee.csv\")\n",
+    "df2 = pd.read_csv(\"datasets_lab2/Stores.csv\")\n",
+    "df3 = pd.read_csv(\"datasets_lab2/StudentsPerformance.csv\")\n",
+    "df.info()\n",
+    "df2.info()\n",
+    "df3.info()"
+   ]
+  },
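+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The `Date` column is typed as `object`; as a quick sanity check (a minimal sketch of our own, not required by the lab), it can be parsed as datetime without modifying `df`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: confirm the Date column parses cleanly as datetime (non-mutating)\n",
+    "pd.to_datetime(df[\"Date\"]).describe()"
+   ]
+  },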
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Problem domain"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The first dataset, coffee.csv, tracks Starbucks stock quotes and can be used to forecast the company's future share performance. The second dataset, Stores.csv, describes retail stores. The third dataset, StudentsPerformance.csv, records students' exam performance."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Dataset analysis"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Objects:\n",
+    "1. Stock quotes\n",
+    "2. Stores\n",
+    "3. Student exam results\n",
+    "\n",
+    "Attributes:\n",
+    "1. Date; opening price of the day; daily high; daily low; closing price; adjusted closing price; daily trading volume.\n",
+    "2. Store ID; floor area; number of items available; average daily customer count; store sales (in US dollars).\n",
+    "3. Gender; race/ethnicity; parental level of education; lunch type; test preparation course; math score; reading score; writing score."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Business goals"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1. Starbucks stock analysis: forecasting future share prices to support financial decisions, avoid sharp losses, and increase profit.\n",
+    "2. Store performance evaluation: understanding the factors that make stores successful, including revenue and footfall, to inform decisions about expanding the chain or optimising the current placement strategy.\n",
+    "3. Student performance analysis: identifying the factors behind academic success in order to design student support programmes."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Technical project goals: what goes in as input, and what the target feature is"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The datasets above are the inputs; the target feature for each is:\n",
+    "1. coffee.csv: the stock's daily high price.\n",
+    "2. Stores.csv: store sales.\n",
+    "3. StudentsPerformance.csv: the overall average score (the mean of the math, reading, and writing scores)."
+   ]
+  },
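+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As an illustration (a sketch of our own; the name `overall_score` is not part of the dataset), the third target can be derived directly from the three score columns:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: the overall average score target for StudentsPerformance.csv (non-mutating)\n",
+    "overall_score = df3[[\"math score\", \"reading score\", \"writing score\"]].mean(axis=1)\n",
+    "overall_score.head()"
+   ]
+  },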
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Dataset problems and their remedies"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1. Stale data: it is important to check how current the data is, especially for the financial time series and the store records. Outdated rows can be dropped, or refreshed when newer data is available.\n",
+    "2. Outliers: anomalies such as sharp swings in stock prices or customer counts need to be identified, and a decision made on whether to remove or smooth them, taking the sample size into account."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Dataset quality"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The datasets contain enough examples and features to train a model, cover the relevant situations of the problem domain, and match the data that will arrive in production. All labels are consistent."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Searching for anomalies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "              Open         High          Low        Close    Adj Close  \\\n",
+      "count  8036.000000  8036.000000  8036.000000  8036.000000  8036.000000   \n",
+      "mean     30.054280    30.351487    29.751322    30.058857    26.674025   \n",
+      "std      33.615577    33.906613    33.314569    33.615911    31.728090   \n",
+      "min       0.328125     0.347656     0.320313     0.335938     0.260703   \n",
+      "25%       4.392031     4.531250     4.304922     4.399610     3.414300   \n",
+      "50%      13.325000    13.493750    13.150000    13.330000    10.352452   \n",
+      "75%      55.250000    55.722501    54.852499    55.267499    47.464829   \n",
+      "max     126.080002   126.320000   124.809998   126.059998   118.010414   \n",
+      "\n",
+      "             Volume  \n",
+      "count  8.036000e+03  \n",
+      "mean   1.470459e+07  \n",
+      "std    1.340021e+07  \n",
+      "min    1.504000e+06  \n",
+      "25%    7.817750e+06  \n",
+      "50%    1.169815e+07  \n",
+      "75%    1.778795e+07  \n",
+      "max    5.855088e+08  \n",
+      "         Store ID   Store_Area  Items_Available  Daily_Customer_Count  \\\n",
+      "count  896.000000   896.000000       896.000000            896.000000   \n",
+      "mean   448.500000  1485.409598      1782.035714            786.350446   \n",
+      "std    258.797218   250.237011       299.872053            265.389281   \n",
+      "min      1.000000   775.000000       932.000000             10.000000   \n",
+      "25%    224.750000  1316.750000      1575.500000            600.000000   \n",
+      "50%    448.500000  1477.000000      1773.500000            780.000000   \n",
+      "75%    672.250000  1653.500000      1982.750000            970.000000   \n",
+      "max    896.000000  2229.000000      2667.000000           1560.000000   \n",
+      "\n",
+      "         Store_Sales  \n",
+      "count     896.000000  \n",
+      "mean    59351.305804  \n",
+      "std     17190.741895  \n",
+      "min     14920.000000  \n",
+      "25%     46530.000000  \n",
+      "50%     58605.000000  \n",
+      "75%     71872.500000  \n",
+      "max    116320.000000  \n",
+      "       math score  reading score  writing score\n",
+      "count  1000.00000    1000.000000    1000.000000\n",
+      "mean     66.08900      69.169000      68.054000\n",
+      "std      15.16308      14.600192      15.195657\n",
+      "min       0.00000      17.000000      10.000000\n",
+      "25%      57.00000      59.000000      57.750000\n",
+      "50%      66.00000      70.000000      69.000000\n",
+      "75%      77.00000      79.000000      79.000000\n",
+      "max     100.00000     100.000000     100.000000\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(df.describe())\n",
+    "print(df2.describe())\n",
+    "print(df3.describe())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Inspecting the summary statistics above, no obvious anomalies stand out in any column; even the extremes (a math score of 0, a store with only 10 customers a day) are plausible values rather than data errors."
+   ]
+  },
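+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As an additional cross-check (a minimal sketch; the 1.5×IQR fence and the choice of column are our own, not required by the lab), candidate outliers can be counted explicitly:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: count values outside the 1.5*IQR fences for one column\n",
+    "q1, q3 = df2[\"Daily_Customer_Count\"].quantile([0.25, 0.75])\n",
+    "iqr = q3 - q1\n",
+    "low_fence, high_fence = q1 - 1.5 * iqr, q3 + 1.5 * iqr\n",
+    "mask = (df2[\"Daily_Customer_Count\"] < low_fence) | (df2[\"Daily_Customer_Count\"] > high_fence)\n",
+    "print(mask.sum(), \"potential outliers in Daily_Customer_Count\")"
+   ]
+  },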
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Missing data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First dataset: coffee.csv\n",
+      "Second dataset: Stores.csv\n",
+      "Third dataset: StudentsPerformance.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Report the share of missing values per column (columns with none are skipped)\n",
+    "print(\"First dataset: coffee.csv\")\n",
+    "for i in df.columns:\n",
+    "    null_rate = df[i].isnull().sum() / len(df) * 100\n",
+    "    if null_rate > 0:\n",
+    "        print(f\"{i}: {null_rate:.2f}% missing\")\n",
+    "print(\"Second dataset: Stores.csv\")\n",
+    "for i in df2.columns:\n",
+    "    null_rate = df2[i].isnull().sum() / len(df2) * 100\n",
+    "    if null_rate > 0:\n",
+    "        print(f\"{i}: {null_rate:.2f}% missing\")\n",
+    "print(\"Third dataset: StudentsPerformance.csv\")\n",
+    "for i in df3.columns:\n",
+    "    null_rate = df3[i].isnull().sum() / len(df3) * 100\n",
+    "    if null_rate > 0:\n",
+    "        print(f\"{i}: {null_rate:.2f}% missing\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "No missing values were found in any of the three datasets, so their completeness is satisfactory. The Stores dataset could, however, use a few more columns."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the first dataset, coffee.csv, we drop the rows dated before the year 2000 (strictly speaking, we keep everything after 2000-01-01; since the first trading day of 2000 is January 3, no 2000 data is lost)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "            Date       Open       High        Low      Close  Adj Close  \\\n",
+      "1899  2000-01-03   2.984375   3.085938   2.906250   3.082031   2.391797   \n",
+      "1900  2000-01-04   3.007813   3.109375   2.968750   2.984375   2.316012   \n",
+      "1901  2000-01-05   2.992188   3.078125   2.960938   3.023438   2.346326   \n",
+      "1902  2000-01-06   3.000000   3.203125   3.000000   3.132813   2.431207   \n",
+      "1903  2000-01-07   3.093750   3.125000   3.031250   3.117188   2.419082   \n",
+      "...          ...        ...        ...        ...        ...        ...   \n",
+      "8031  2024-05-17  75.269997  78.000000  74.919998  77.849998  77.849998   \n",
+      "8032  2024-05-20  77.680000  78.320000  76.709999  77.540001  77.540001   \n",
+      "8033  2024-05-21  77.559998  78.220001  77.500000  77.720001  77.720001   \n",
+      "8034  2024-05-22  77.699997  81.019997  77.440002  80.720001  80.720001   \n",
+      "8035  2024-05-23  80.099998  80.699997  79.169998  79.260002  79.260002   \n",
+      "\n",
+      "        Volume  \n",
+      "1899  24232000  \n",
+      "1900  21564800  \n",
+      "1901  28206400  \n",
+      "1902  30825600  \n",
+      "1903  26044800  \n",
+      "...        ...  \n",
+      "8031  14436500  \n",
+      "8032  11183800  \n",
+      "8033   8916600  \n",
+      "8034  22063400  \n",
+      "8035   4651418  \n",
+      "\n",
+      "[6137 rows x 7 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# ISO-formatted date strings compare correctly as plain strings\n",
+    "df_filtered = df[df['Date'] > '2000-01-01']\n",
+    "print(df_filtered)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the second dataset, Stores.csv, we overwrite every store's area with the mean value (purely as a demonstration of bulk assignment; note that this erases the column's real variation)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "     Store ID  Store_Area  Items_Available  Daily_Customer_Count  Store_Sales\n",
+      "0           1      1500.0             1961                   530        66490\n",
+      "1           2      1500.0             1752                   210        39820\n",
+      "2           3      1500.0             1609                   720        54010\n",
+      "3           4      1500.0             1748                   620        53730\n",
+      "4           5      1500.0             2111                   450        46620\n",
+      "..        ...         ...              ...                   ...          ...\n",
+      "891       892      1500.0             1910                  1080        66390\n",
+      "892       893      1500.0             1663                   850        82080\n",
+      "893       894      1500.0             1436                  1060        76440\n",
+      "894       895      1500.0             1560                   770        96610\n",
+      "895       896      1500.0             1429                  1110        54340\n",
+      "\n",
+      "[896 rows x 5 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "store_area_mean = df2['Store_Area'].mean()\n",
+    "df2['Store_Area'] = store_area_mean\n",
+    "print(df2)"
+   ]
+  },
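+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If the intent were genuine imputation rather than a demonstration, a gentler variant (a sketch of our own; it re-reads the file so the cell above is unaffected) would fill only missing entries:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: impute only missing values instead of overwriting the whole column\n",
+    "df2_fresh = pd.read_csv(\"datasets_lab2/Stores.csv\")\n",
+    "df2_fresh[\"Store_Area\"] = df2_fresh[\"Store_Area\"].fillna(df2_fresh[\"Store_Area\"].mean())\n",
+    "df2_fresh[\"Store_Area\"].describe()"
+   ]
+  },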
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the third dataset, StudentsPerformance.csv, we set every student's lunch type to 'standard'."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "     gender race/ethnicity parental level of education     lunch  \\\n",
+      "0    female        group B           bachelor's degree  standard   \n",
+      "1    female        group C                some college  standard   \n",
+      "2    female        group B             master's degree  standard   \n",
+      "3      male        group A          associate's degree  standard   \n",
+      "4      male        group C                some college  standard   \n",
+      "..      ...            ...                         ...       ...   \n",
+      "995  female        group E             master's degree  standard   \n",
+      "996    male        group C                 high school  standard   \n",
+      "997  female        group C                 high school  standard   \n",
+      "998  female        group D                some college  standard   \n",
+      "999  female        group D                some college  standard   \n",
+      "\n",
+      "    test preparation course  math score  reading score  writing score  \n",
+      "0                      none          72             72             74  \n",
+      "1                 completed          69             90             88  \n",
+      "2                      none          90             95             93  \n",
+      "3                      none          47             57             44  \n",
+      "4                      none          76             78             75  \n",
+      "..                      ...         ...            ...            ...  \n",
+      "995               completed          88             99             95  \n",
+      "996                    none          62             55             55  \n",
+      "997               completed          59             71             65  \n",
+      "998               completed          68             78             77  \n",
+      "999                    none          77             86             86  \n",
+      "\n",
+      "[1000 rows x 8 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "df3['lunch'] = 'standard'\n",
+    "print(df3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import math\n",
+    "\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "def split_stratified_into_train_val_test(\n",
+    "    df_input,\n",
+    "    stratify_colname=\"y\",\n",
+    "    frac_train=0.6,\n",
+    "    frac_val=0.15,\n",
+    "    frac_test=0.25,\n",
+    "    random_state=None,\n",
+    "):\n",
+    "    # Compare fractions with a tolerance: exact float equality is fragile\n",
+    "    if not math.isclose(frac_train + frac_val + frac_test, 1.0):\n",
+    "        raise ValueError(\n",
+    "            \"fractions %f, %f, %f do not add up to 1.0\"\n",
+    "            % (frac_train, frac_val, frac_test)\n",
+    "        )\n",
+    "\n",
+    "    if stratify_colname not in df_input.columns:\n",
+    "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
+    "\n",
+    "    X = df_input  # Contains all columns.\n",
+    "    y = df_input[\n",
+    "        [stratify_colname]\n",
+    "    ]  # Dataframe of just the column on which to stratify.\n",
+    "\n",
+    "    # Split original dataframe into train and temp dataframes.\n",
+    "    df_train, df_temp, y_train, y_temp = train_test_split(\n",
+    "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
+    "    )\n",
+    "\n",
+    "    # Split the temp dataframe into val and test dataframes.\n",
+    "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
+    "    df_val, df_test, y_val, y_test = train_test_split(\n",
+    "        df_temp,\n",
+    "        y_temp,\n",
+    "        stratify=y_temp,\n",
+    "        test_size=relative_frac_test,\n",
+    "        random_state=random_state,\n",
+    "    )\n",
+    "\n",
+    "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
+    "\n",
+    "    return df_train, df_val, df_test"
+   ]
+  },
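+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The helper above is defined but never invoked later in the notebook. As a usage sketch (stratifying on the categorical `gender` column is our own choice), it could be applied to the students dataset like this:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: stratified 60/20/20 split of StudentsPerformance on gender\n",
+    "train3, val3, test3 = split_stratified_into_train_val_test(\n",
+    "    df3,\n",
+    "    stratify_colname=\"gender\",\n",
+    "    frac_train=0.6,\n",
+    "    frac_val=0.2,\n",
+    "    frac_test=0.2,\n",
+    "    random_state=42,\n",
+    ")\n",
+    "print(train3.shape, val3.shape, test3.shape)"
+   ]
+  },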
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train df: (4821, 7), Validation df: (1607, 7), Test df: (1608, 7)\n",
+      "Train df2: (537, 5), Validation df2: (179, 5), Test df2: (180, 5)\n",
+      "Train df3: (600, 8), Validation df3: (200, 8), Test df3: (200, 8)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# A plain (non-stratified) 60/20/20 split for each dataset\n",
+    "train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)\n",
+    "val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n",
+    "\n",
+    "train_df2, temp_df2 = train_test_split(df2, test_size=0.4, random_state=42)\n",
+    "val_df2, test_df2 = train_test_split(temp_df2, test_size=0.5, random_state=42)\n",
+    "\n",
+    "train_df3, temp_df3 = train_test_split(df3, test_size=0.4, random_state=42)\n",
+    "val_df3, test_df3 = train_test_split(temp_df3, test_size=0.5, random_state=42)\n",
+    "print(f\"Train df: {train_df.shape}, Validation df: {val_df.shape}, Test df: {test_df.shape}\")\n",
+    "print(f\"Train df2: {train_df2.shape}, Validation df2: {val_df2.shape}, Test df2: {test_df2.shape}\")\n",
+    "print(f\"Train df3: {train_df3.shape}, Validation df3: {val_df3.shape}, Test df3: {test_df3.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Each dataset was split 60%/20%/20% into train, validation, and test subsets using scikit-learn's train_test_split. Judging by the shapes above, the subsets look balanced."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Augmentation with oversampling and undersampling"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "After oversampling (df):\n",
+      "High_category\n",
+      "Low       4835\n",
+      "Medium    4835\n",
+      "High      4835\n",
+      "Luxury    4835\n",
+      "Name: count, dtype: int64\n",
+      "After undersampling (df):\n",
+      "High_category\n",
+      "Low       321\n",
+      "Medium    321\n",
+      "High      321\n",
+      "Luxury    321\n",
+      "Name: count, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "from imblearn.over_sampling import RandomOverSampler\n",
+    "from imblearn.under_sampling import RandomUnderSampler\n",
+    "\n",
+    "# Bin the daily high into four price bands to get a categorical target\n",
+    "df = pd.read_csv(\"datasets_lab2/coffee.csv\")\n",
+    "df['High_category'] = pd.cut(df['High'], bins=[0.1, 20, 60, 105, float('inf')],\n",
+    "                             labels=['Low', 'Medium', 'High', 'Luxury'])\n",
+    "\n",
+    "y = df['High_category']\n",
+    "X = df.drop(columns=['High', 'High_category'])\n",
+    "\n",
+    "oversampler = RandomOverSampler(random_state=42)\n",
+    "X_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
+    "\n",
+    "undersampler = RandomUnderSampler(random_state=42)\n",
+    "X_resampled_under, y_resampled_under = undersampler.fit_resample(X, y)\n",
+    "\n",
+    "print(\"After oversampling (df):\")\n",
+    "print(pd.Series(y_resampled).value_counts())\n",
+    "\n",
+    "print(\"After undersampling (df):\")\n",
+    "print(pd.Series(y_resampled_under).value_counts())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "After oversampling (df2):\n",
+      "Sales_category\n",
+      "Low       598\n",
+      "Medium    598\n",
+      "High      598\n",
+      "Luxury      0\n",
+      "Name: count, dtype: int64\n",
+      "After undersampling (df2):\n",
+      "Sales_category\n",
+      "Low       7\n",
+      "Medium    7\n",
+      "High      7\n",
+      "Luxury    0\n",
+      "Name: count, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "df2 = pd.read_csv(\"datasets_lab2/Stores.csv\")\n",
+    "\n",
+    "# With these bins 'Luxury' (> 200000) is empty and 'High' holds only 7 stores,\n",
+    "# which is why undersampling below collapses every class to 7 rows\n",
+    "df2['Sales_category'] = pd.cut(df2['Store_Sales'], bins=[0, 50000, 100000, 200000, float('inf')],\n",
+    "                               labels=['Low', 'Medium', 'High', 'Luxury'])\n",
+    "\n",
+    "y2 = df2['Sales_category']\n",
+    "X2 = df2.drop(columns=['Store_Sales', 'Sales_category'])\n",
+    "\n",
+    "oversampler2 = RandomOverSampler(random_state=42)\n",
+    "X_resampled_2, y_resampled_2 = oversampler2.fit_resample(X2, y2)\n",
+    "\n",
+    "undersampler2 = RandomUnderSampler(random_state=42)\n",
+    "X_resampled_2_under, y_resampled_2_under = undersampler2.fit_resample(X2, y2)\n",
+    "\n",
+    "print(\"After oversampling (df2):\")\n",
+    "print(pd.Series(y_resampled_2).value_counts())\n",
+    "\n",
+    "print(\"After undersampling (df2):\")\n",
+    "print(pd.Series(y_resampled_2_under).value_counts())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "After oversampling (df3):\n",
+      "reading_score_category\n",
+      "Low       903\n",
+      "Medium    903\n",
+      "High      903\n",
+      "Luxury      0\n",
+      "Name: count, dtype: int64\n",
+      "After undersampling (df3):\n",
+      "reading_score_category\n",
+      "Low       1\n",
+      "Medium    1\n",
+      "High      1\n",
+      "Luxury    0\n",
+      "Name: count, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "df3 = pd.read_csv(\"datasets_lab2/StudentsPerformance.csv\")\n",
+    "\n",
+    "# With these bins 'Luxury' (> 100) is unreachable and 'Low' (<= 20) holds a single\n",
+    "# student, so undersampling below keeps just one row per class\n",
+    "df3['reading_score_category'] = pd.cut(df3['reading score'], bins=[0, 20, 50, 100, float('inf')],\n",
+    "                                       labels=['Low', 'Medium', 'High', 'Luxury'])\n",
+    "\n",
+    "y3 = df3['reading_score_category']\n",
+    "X3 = df3.drop(columns=['reading score', 'reading_score_category'])\n",
+    "\n",
+    "oversampler3 = RandomOverSampler(random_state=42)\n",
+    "X_resampled_3, y_resampled_3 = oversampler3.fit_resample(X3, y3)\n",
+    "\n",
+    "undersampler3 = RandomUnderSampler(random_state=42)\n",
+    "X_resampled_3_under, y_resampled_3_under = undersampler3.fit_resample(X3, y3)\n",
+    "\n",
+    "print(\"After oversampling (df3):\")\n",
+    "print(pd.Series(y_resampled_3).value_counts())\n",
+    "\n",
+    "print(\"After undersampling (df3):\")\n",
+    "print(pd.Series(y_resampled_3_under).value_counts())"
+   ]
+  },
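+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A caveat on the two runs above (our own observation from the printed counts): the 'Luxury' bins are empty and the minority classes are tiny, so undersampling shrinks the Stores data to 21 rows and the students data to 3 rows, far too few to train on. Inspecting the raw class frequencies before resampling, as sketched below, makes such degenerate binnings easy to spot."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: check raw class frequencies before resampling to catch empty or tiny bins\n",
+    "print(df2[\"Sales_category\"].value_counts(dropna=False))\n",
+    "print(df3[\"reading_score_category\"].value_counts(dropna=False))"
+   ]
+  }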
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/lab_2/requirements.txt b/lab_2/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a7ef9202560761420552151cc9fec5ee0494697b
GIT binary patch
literal 1590
zcmZvcU2@t$5QOJBRXGYNBaEFqe;&E1hoj>|+r-JyhIWD(IhSJE>S@JW$2umdej(=e-2}I(uu0`OOhv6~
zoVpWo6<-n`pIz0*Q|h3C8LPP0w-s)Ztr>C9?Fl#PS_-+*H@(b-cJE;$|8T!g4G-ye
zqraKAFrjCbdjGA))jBKj6-CB<7T-zFAsu^7^fCK=;*pwO{G_NKX(R(oV66XDO@!TM
z2fl$!@|exY{7Ql+VmN3%HqwZtXkaFL*UEw2H9=+b_O4U+N;hDkT#VB(+yN?DZ5kG^
zo$%rpr~CAvC}eZqeJ8|iPJX(mDtE`aa0Pcr-(CvWss{@s`0>cMtkdr=IYS>0>TpWg
zfOqH{1+Ts7gyOspd4s#~awBUY)$u9?uHl^7oP%$2^ihn?$uRjelUZ=ia@F$>-l4?_
zB-u`$b3I8H$>CTVoFoDMPLr|I0l$x*=|`pCs!^)JC4H^*R#`DqMRDt_u3h}Nbo#cH
z=?|F`d-wkKqMMXEm9B)TWNMlUoScK$-$o}x-k89tDiaTH&f%tU+u6cGt}r+6os~Mh
z4O~Qto$$M@%~~fMoFSvVkQSK+-lhE|Lw*lBN$dhI$ua7j