From f26b7ea386b22609d1dc2b00ad45118e1487bda9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9D=D0=B8=D0=BA=D0=B8=D1=82=D0=B0=20=D0=9F=D0=BE=D1=82?=
 =?UTF-8?q?=D0=B0=D0=BF=D0=BE=D0=B2?= <ns.potapov@yandex.ru>
Date: Sat, 23 Nov 2024 18:13:22 +0400
Subject: [PATCH] =?UTF-8?q?=D0=9F=D1=80=D0=BE=D0=B2=D0=B5=D1=80=D0=B8?=
 =?UTF-8?q?=D0=BB=20=D0=B4=D0=B0=D1=82=D0=B0=D1=84=D1=80=D0=B5=D0=B9=D0=BC?=
 =?UTF-8?q?=D1=8B=20=D0=BD=D0=B0=20=D1=81=D0=BE=D0=B4=D0=B5=D1=80=D0=B6?=
 =?UTF-8?q?=D0=B0=D0=BD=D0=B8=D0=B5=20=D0=BF=D1=83=D1=81=D1=82=D1=8B=D1=85?=
 =?UTF-8?q?=20=D0=BF=D1=80=D0=B8=D0=B7=D0=BD=D0=B0=D0=BA=D0=BE=D0=B2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 lab_2/lab2.ipynb | 568 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 567 insertions(+), 1 deletion(-)
diff --git a/lab_2/lab2.ipynb b/lab_2/lab2.ipynb
index e8cf84b..f7bb449 100644
--- a/lab_2/lab2.ipynb
+++ b/lab_2/lab2.ipynb
@@ -93,6 +93,237 @@
     "  - **Целевой признак:** Зарплата в зависимости от удаленности работы и типа занятости (фиксированная сумма или разница в зарплатах для разных типов занятости)."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Выполним все необходимые импорты"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import Any\n",
+    "from math import ceil\n",
+    "\n",
+    "import pandas as pd\n",
+    "from pandas import DataFrame, Series\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from imblearn.over_sampling import ADASYN\n",
+    "from imblearn.under_sampling import RandomUnderSampler\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Считаем данные для первого датасета"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 3755 entries, 0 to 3754\n",
+      "Data columns (total 11 columns):\n",
+      " #   Column              Non-Null Count  Dtype \n",
+      "---  ------              --------------  ----- \n",
+      " 0   work_year           3755 non-null   int64 \n",
+      " 1   experience_level    3755 non-null   object\n",
+      " 2   employment_type     3755 non-null   object\n",
+      " 3   job_title           3755 non-null   object\n",
+      " 4   salary              3755 non-null   int64 \n",
+      " 5   salary_currency     3755 non-null   object\n",
+      " 6   salary_in_usd       3755 non-null   int64 \n",
+      " 7   employee_residence  3755 non-null   object\n",
+      " 8   remote_ratio        3755 non-null   int64 \n",
+      " 9   company_location    3755 non-null   object\n",
+      " 10  company_size        3755 non-null   object\n",
+      "dtypes: int64(4), object(7)\n",
+      "memory usage: 322.8+ KB\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>count</th>\n",
+       "      <th>mean</th>\n",
+       "      <th>std</th>\n",
+       "      <th>min</th>\n",
+       "      <th>25%</th>\n",
+       "      <th>50%</th>\n",
+       "      <th>75%</th>\n",
+       "      <th>max</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>work_year</th>\n",
+       "      <td>3755.0</td>\n",
+       "      <td>2022.373635</td>\n",
+       "      <td>0.691448</td>\n",
+       "      <td>2020.0</td>\n",
+       "      <td>2022.0</td>\n",
+       "      <td>2022.0</td>\n",
+       "      <td>2023.0</td>\n",
+       "      <td>2023.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>salary</th>\n",
+       "      <td>3755.0</td>\n",
+       "      <td>190695.571771</td>\n",
+       "      <td>671676.500508</td>\n",
+       "      <td>6000.0</td>\n",
+       "      <td>100000.0</td>\n",
+       "      <td>138000.0</td>\n",
+       "      <td>180000.0</td>\n",
+       "      <td>30400000.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>salary_in_usd</th>\n",
+       "      <td>3755.0</td>\n",
+       "      <td>137570.389880</td>\n",
+       "      <td>63055.625278</td>\n",
+       "      <td>5132.0</td>\n",
+       "      <td>95000.0</td>\n",
+       "      <td>135000.0</td>\n",
+       "      <td>175000.0</td>\n",
+       "      <td>450000.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>remote_ratio</th>\n",
+       "      <td>3755.0</td>\n",
+       "      <td>46.271638</td>\n",
+       "      <td>48.589050</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>100.0</td>\n",
+       "      <td>100.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                count           mean            std     min       25%  \\\n",
+       "work_year      3755.0    2022.373635       0.691448  2020.0    2022.0   \n",
+       "salary         3755.0  190695.571771  671676.500508  6000.0  100000.0   \n",
+       "salary_in_usd  3755.0  137570.389880   63055.625278  5132.0   95000.0   \n",
+       "remote_ratio   3755.0      46.271638      48.589050     0.0       0.0   \n",
+       "\n",
+       "                    50%       75%         max  \n",
+       "work_year        2022.0    2023.0      2023.0  \n",
+       "salary         138000.0  180000.0  30400000.0  \n",
+       "salary_in_usd  135000.0  175000.0    450000.0  \n",
+       "remote_ratio        0.0     100.0       100.0  "
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = pd.read_csv(\"csv/8.ds_salaries.csv\")  # first dataset; the path is relative to the working directory\n",
+    "df.info()  # prints column names, non-null counts and dtypes\n",
+    "df.describe().T  # cell output: summary statistics with one row per numeric column"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Функция проверки пустых значений в датафрейме"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def check_null_columns(dataframe: DataFrame) -> None:\n",
+    "    \"\"\"Print which columns contain missing values and, if any do, their counts and percentages.\"\"\"\n",
+    "    null_mask: DataFrame = dataframe.isnull()  # computed once and reused by every report below\n",
+    "    print('Присутствуют ли пустые значения признаков в колонке:')\n",
+    "    print(null_mask.any(), '\\n')\n",
+    "    if not null_mask.any().any():\n",
+    "        return  # nothing is missing, so the count and percentage reports are skipped\n",
+    "    print('Количество пустых значений признаков в колонке:')\n",
+    "    print(null_mask.sum(), '\\n')\n",
+    "    print('Процент пустых значений признаков в колонке:')\n",
+    "    # Percentage of missing rows per column; columns without missing values are not listed.\n",
+    "    for column, null_rate in (null_mask.sum() / len(dataframe) * 100).items():\n",
+    "        if null_rate > 0:\n",
+    "            print(f\"{column} процент пустых значений: {null_rate:.2f}%\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Проверим на пустые значения в колонках"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Присутствуют ли пустые значения признаков в колонке:\n",
+      "work_year             False\n",
+      "experience_level      False\n",
+      "employment_type       False\n",
+      "job_title             False\n",
+      "salary                False\n",
+      "salary_currency       False\n",
+      "salary_in_usd         False\n",
+      "employee_residence    False\n",
+      "remote_ratio          False\n",
+      "company_location      False\n",
+      "company_size          False\n",
+      "dtype: bool \n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "check_null_columns(dataframe=df)  # report missing values per column of df"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -192,6 +423,171 @@
     "   - **Цель**: Прогнозировать объем продаж (Store_Sales)."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 896 entries, 0 to 895\n",
+      "Data columns (total 5 columns):\n",
+      " #   Column                Non-Null Count  Dtype\n",
+      "---  ------                --------------  -----\n",
+      " 0   Store ID              896 non-null    int64\n",
+      " 1   Store_Area            896 non-null    int64\n",
+      " 2   Items_Available       896 non-null    int64\n",
+      " 3   Daily_Customer_Count  896 non-null    int64\n",
+      " 4   Store_Sales           896 non-null    int64\n",
+      "dtypes: int64(5)\n",
+      "memory usage: 35.1 KB\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>count</th>\n",
+       "      <th>mean</th>\n",
+       "      <th>std</th>\n",
+       "      <th>min</th>\n",
+       "      <th>25%</th>\n",
+       "      <th>50%</th>\n",
+       "      <th>75%</th>\n",
+       "      <th>max</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>Store ID</th>\n",
+       "      <td>896.0</td>\n",
+       "      <td>448.500000</td>\n",
+       "      <td>258.797218</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>224.75</td>\n",
+       "      <td>448.5</td>\n",
+       "      <td>672.25</td>\n",
+       "      <td>896.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Store_Area</th>\n",
+       "      <td>896.0</td>\n",
+       "      <td>1485.409598</td>\n",
+       "      <td>250.237011</td>\n",
+       "      <td>775.0</td>\n",
+       "      <td>1316.75</td>\n",
+       "      <td>1477.0</td>\n",
+       "      <td>1653.50</td>\n",
+       "      <td>2229.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Items_Available</th>\n",
+       "      <td>896.0</td>\n",
+       "      <td>1782.035714</td>\n",
+       "      <td>299.872053</td>\n",
+       "      <td>932.0</td>\n",
+       "      <td>1575.50</td>\n",
+       "      <td>1773.5</td>\n",
+       "      <td>1982.75</td>\n",
+       "      <td>2667.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Daily_Customer_Count</th>\n",
+       "      <td>896.0</td>\n",
+       "      <td>786.350446</td>\n",
+       "      <td>265.389281</td>\n",
+       "      <td>10.0</td>\n",
+       "      <td>600.00</td>\n",
+       "      <td>780.0</td>\n",
+       "      <td>970.00</td>\n",
+       "      <td>1560.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Store_Sales</th>\n",
+       "      <td>896.0</td>\n",
+       "      <td>59351.305804</td>\n",
+       "      <td>17190.741895</td>\n",
+       "      <td>14920.0</td>\n",
+       "      <td>46530.00</td>\n",
+       "      <td>58605.0</td>\n",
+       "      <td>71872.50</td>\n",
+       "      <td>116320.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                      count          mean           std      min       25%  \\\n",
+       "Store ID              896.0    448.500000    258.797218      1.0    224.75   \n",
+       "Store_Area            896.0   1485.409598    250.237011    775.0   1316.75   \n",
+       "Items_Available       896.0   1782.035714    299.872053    932.0   1575.50   \n",
+       "Daily_Customer_Count  896.0    786.350446    265.389281     10.0    600.00   \n",
+       "Store_Sales           896.0  59351.305804  17190.741895  14920.0  46530.00   \n",
+       "\n",
+       "                          50%       75%       max  \n",
+       "Store ID                448.5    672.25     896.0  \n",
+       "Store_Area             1477.0   1653.50    2229.0  \n",
+       "Items_Available        1773.5   1982.75    2667.0  \n",
+       "Daily_Customer_Count    780.0    970.00    1560.0  \n",
+       "Store_Sales           58605.0  71872.50  116320.0  "
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = pd.read_csv(\"csv/9.Stores.csv\")  # stores dataset; the path is relative to the working directory\n",
+    "df.info()  # prints column names, non-null counts and dtypes\n",
+    "df.describe().T  # cell output: summary statistics with one row per numeric column"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Присутствуют ли пустые значения признаков в колонке:\n",
+      "Store ID                False\n",
+      "Store_Area              False\n",
+      "Items_Available         False\n",
+      "Daily_Customer_Count    False\n",
+      "Store_Sales             False\n",
+      "dtype: bool \n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "check_null_columns(dataframe=df)  # report missing values per column of df"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -265,11 +661,181 @@
     "\n",
     "Каждый из этих проектов направлен на повышение прибыльности компании, улучшение персонализированного подхода к клиентам и снижение финансовых рисков."
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 2772 entries, 0 to 2771\n",
+      "Data columns (total 7 columns):\n",
+      " #   Column    Non-Null Count  Dtype  \n",
+      "---  ------    --------------  -----  \n",
+      " 0   age       2772 non-null   int64  \n",
+      " 1   sex       2772 non-null   object \n",
+      " 2   bmi       2772 non-null   float64\n",
+      " 3   children  2772 non-null   int64  \n",
+      " 4   smoker    2772 non-null   object \n",
+      " 5   region    2772 non-null   object \n",
+      " 6   charges   2772 non-null   float64\n",
+      "dtypes: float64(2), int64(2), object(3)\n",
+      "memory usage: 151.7+ KB\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>count</th>\n",
+       "      <th>mean</th>\n",
+       "      <th>std</th>\n",
+       "      <th>min</th>\n",
+       "      <th>25%</th>\n",
+       "      <th>50%</th>\n",
+       "      <th>75%</th>\n",
+       "      <th>max</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>age</th>\n",
+       "      <td>2772.0</td>\n",
+       "      <td>39.109668</td>\n",
+       "      <td>14.081459</td>\n",
+       "      <td>18.0000</td>\n",
+       "      <td>26.000</td>\n",
+       "      <td>39.00000</td>\n",
+       "      <td>51.0000</td>\n",
+       "      <td>64.00000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>bmi</th>\n",
+       "      <td>2772.0</td>\n",
+       "      <td>30.701349</td>\n",
+       "      <td>6.129449</td>\n",
+       "      <td>15.9600</td>\n",
+       "      <td>26.220</td>\n",
+       "      <td>30.44750</td>\n",
+       "      <td>34.7700</td>\n",
+       "      <td>53.13000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>children</th>\n",
+       "      <td>2772.0</td>\n",
+       "      <td>1.101732</td>\n",
+       "      <td>1.214806</td>\n",
+       "      <td>0.0000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>1.00000</td>\n",
+       "      <td>2.0000</td>\n",
+       "      <td>5.00000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>charges</th>\n",
+       "      <td>2772.0</td>\n",
+       "      <td>13261.369959</td>\n",
+       "      <td>12151.768945</td>\n",
+       "      <td>1121.8739</td>\n",
+       "      <td>4687.797</td>\n",
+       "      <td>9333.01435</td>\n",
+       "      <td>16577.7795</td>\n",
+       "      <td>63770.42801</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "           count          mean           std        min       25%         50%  \\\n",
+       "age       2772.0     39.109668     14.081459    18.0000    26.000    39.00000   \n",
+       "bmi       2772.0     30.701349      6.129449    15.9600    26.220    30.44750   \n",
+       "children  2772.0      1.101732      1.214806     0.0000     0.000     1.00000   \n",
+       "charges   2772.0  13261.369959  12151.768945  1121.8739  4687.797  9333.01435   \n",
+       "\n",
+       "                 75%          max  \n",
+       "age          51.0000     64.00000  \n",
+       "bmi          34.7700     53.13000  \n",
+       "children      2.0000      5.00000  \n",
+       "charges   16577.7795  63770.42801  "
+      ]
+     },
+     "execution_count": 38,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = pd.read_csv(\"csv/5.medical_insurance.csv\")  # medical insurance dataset; the path is relative to the working directory\n",
+    "df.info()  # prints column names, non-null counts and dtypes\n",
+    "df.describe().T  # cell output: summary statistics with one row per numeric column"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Присутствуют ли пустые значения признаков в колонке:\n",
+      "age         False\n",
+      "sex         False\n",
+      "bmi         False\n",
+      "children    False\n",
+      "smoker      False\n",
+      "region      False\n",
+      "charges     False\n",
+      "dtype: bool \n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "check_null_columns(dataframe=df)  # report missing values per column of df"
+   ]
   }
  ],
  "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
   "language_info": {
-   "name": "python"
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.6"
   }
  },
  "nbformat": 4,

	count	mean	std	min	25%	50%	75%	max
work_year	3755.0	2022.373635	0.691448	2020.0	2022.0	2022.0	2023.0	2023.0
salary	3755.0	190695.571771	671676.500508	6000.0	100000.0	138000.0	180000.0	30400000.0
salary_in_usd	3755.0	137570.389880	63055.625278	5132.0	95000.0	135000.0	175000.0	450000.0
remote_ratio	3755.0	46.271638	48.589050	0.0	0.0	0.0	100.0	100.0
	count	mean	std	min	25%	50%	75%	max
Store ID	896.0	448.500000	258.797218	1.0	224.75	448.5	672.25	896.0
Store_Area	896.0	1485.409598	250.237011	775.0	1316.75	1477.0	1653.50	2229.0
Items_Available	896.0	1782.035714	299.872053	932.0	1575.50	1773.5	1982.75	2667.0
Daily_Customer_Count	896.0	786.350446	265.389281	10.0	600.00	780.0	970.00	1560.0
Store_Sales	896.0	59351.305804	17190.741895	14920.0	46530.00	58605.0	71872.50	116320.0
	count	mean	std	min	25%	50%	75%	max
age	2772.0	39.109668	14.081459	18.0000	26.000	39.00000	51.0000	64.00000
bmi	2772.0	30.701349	6.129449	15.9600	26.220	30.44750	34.7700	53.13000
children	2772.0	1.101732	1.214806	0.0000	0.000	1.00000	2.0000	5.00000
charges	2772.0	13261.369959	12151.768945	1121.8739	4687.797	9333.01435	16577.7795	63770.42801