Проверил датафреймы на содержание пустых признаков

2024-11-23 18:13:22 +04:00 · 2024-11-23 18:13:22 +04:00 · f26b7ea386
commit f26b7ea386
parent 87062d0743
1 changed files with 567 additions and 1 deletions
--- a/lab_2/lab2.ipynb
+++ b/lab_2/lab2.ipynb
@ -93,6 +93,237 @@
    "  - **Целевой признак:** Зарплата в зависимости от удаленности работы и типа занятости (фиксированная сумма или разница в зарплатах для разных типов занятости)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Выполним все необходимые импорты"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Any\n",
    "from math import ceil\n",
    "\n",
    "import pandas as pd\n",
    "from pandas import DataFrame, Series\n",
    "from sklearn.model_selection import train_test_split\n",
    "from imblearn.over_sampling import ADASYN\n",
    "from imblearn.under_sampling import RandomUnderSampler\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Считаем данные для первого датасета"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 3755 entries, 0 to 3754\n",
      "Data columns (total 11 columns):\n",
      " #   Column              Non-Null Count  Dtype \n",
      "---  ------              --------------  ----- \n",
      " 0   work_year           3755 non-null   int64 \n",
      " 1   experience_level    3755 non-null   object\n",
      " 2   employment_type     3755 non-null   object\n",
      " 3   job_title           3755 non-null   object\n",
      " 4   salary              3755 non-null   int64 \n",
      " 5   salary_currency     3755 non-null   object\n",
      " 6   salary_in_usd       3755 non-null   int64 \n",
      " 7   employee_residence  3755 non-null   object\n",
      " 8   remote_ratio        3755 non-null   int64 \n",
      " 9   company_location    3755 non-null   object\n",
      " 10  company_size        3755 non-null   object\n",
      "dtypes: int64(4), object(7)\n",
      "memory usage: 322.8+ KB\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>min</th>\n",
       "      <th>25%</th>\n",
       "      <th>50%</th>\n",
       "      <th>75%</th>\n",
       "      <th>max</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>work_year</th>\n",
       "      <td>3755.0</td>\n",
       "      <td>2022.373635</td>\n",
       "      <td>0.691448</td>\n",
       "      <td>2020.0</td>\n",
       "      <td>2022.0</td>\n",
       "      <td>2022.0</td>\n",
       "      <td>2023.0</td>\n",
       "      <td>2023.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>salary</th>\n",
       "      <td>3755.0</td>\n",
       "      <td>190695.571771</td>\n",
       "      <td>671676.500508</td>\n",
       "      <td>6000.0</td>\n",
       "      <td>100000.0</td>\n",
       "      <td>138000.0</td>\n",
       "      <td>180000.0</td>\n",
       "      <td>30400000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>salary_in_usd</th>\n",
       "      <td>3755.0</td>\n",
       "      <td>137570.389880</td>\n",
       "      <td>63055.625278</td>\n",
       "      <td>5132.0</td>\n",
       "      <td>95000.0</td>\n",
       "      <td>135000.0</td>\n",
       "      <td>175000.0</td>\n",
       "      <td>450000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>remote_ratio</th>\n",
       "      <td>3755.0</td>\n",
       "      <td>46.271638</td>\n",
       "      <td>48.589050</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>100.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                count           mean            std     min       25%  \\\n",
       "work_year      3755.0    2022.373635       0.691448  2020.0    2022.0   \n",
       "salary         3755.0  190695.571771  671676.500508  6000.0  100000.0   \n",
       "salary_in_usd  3755.0  137570.389880   63055.625278  5132.0   95000.0   \n",
       "remote_ratio   3755.0      46.271638      48.589050     0.0       0.0   \n",
       "\n",
       "                    50%       75%         max  \n",
       "work_year        2022.0    2023.0      2023.0  \n",
       "salary         138000.0  180000.0  30400000.0  \n",
       "salary_in_usd  135000.0  175000.0    450000.0  \n",
       "remote_ratio        0.0     100.0       100.0  "
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('csv/8.ds_salaries.csv')\n",
    "df.info()\n",
    "df.describe().transpose()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Метод проверки пустых значений в датафрейме"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Проверка пропущенных данных\n",
    "def check_null_columns(dataframe: DataFrame) -> None:\n",
    "    print('Присутствуют ли пустые значения признаков в колонке:')\n",
    "    print(dataframe.isnull().any(), '\\n')\n",
    "\n",
    "    if any(dataframe.isnull().any()):\n",
    "        print('Количество пустых значений признаков в колонке:')\n",
    "        print(dataframe.isnull().sum(), '\\n')\n",
    "\n",
    "        print('Процент пустых значений признаков в колонке:')\n",
    "        for column in dataframe.columns:\n",
    "            null_rate: float = dataframe[column].isnull().sum() / len(dataframe) * 100\n",
    "            if null_rate > 0:\n",
    "                print(f\"{column} процент пустых значений: {null_rate:.2f}%\")            "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Проверим на пустые значения в колонках"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Присутствуют ли пустые значения признаков в колонке:\n",
      "work_year             False\n",
      "experience_level      False\n",
      "employment_type       False\n",
      "job_title             False\n",
      "salary                False\n",
      "salary_currency       False\n",
      "salary_in_usd         False\n",
      "employee_residence    False\n",
      "remote_ratio          False\n",
      "company_location      False\n",
      "company_size          False\n",
      "dtype: bool \n",
      "\n"
     ]
    }
   ],
   "source": [
    "check_null_columns(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
@ -192,6 +423,171 @@
    "   - **Цель**: Прогнозировать объем продаж (Store_Sales)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 896 entries, 0 to 895\n",
      "Data columns (total 5 columns):\n",
      " #   Column                Non-Null Count  Dtype\n",
      "---  ------                --------------  -----\n",
      " 0   Store ID              896 non-null    int64\n",
      " 1   Store_Area            896 non-null    int64\n",
      " 2   Items_Available       896 non-null    int64\n",
      " 3   Daily_Customer_Count  896 non-null    int64\n",
      " 4   Store_Sales           896 non-null    int64\n",
      "dtypes: int64(5)\n",
      "memory usage: 35.1 KB\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>min</th>\n",
       "      <th>25%</th>\n",
       "      <th>50%</th>\n",
       "      <th>75%</th>\n",
       "      <th>max</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Store ID</th>\n",
       "      <td>896.0</td>\n",
       "      <td>448.500000</td>\n",
       "      <td>258.797218</td>\n",
       "      <td>1.0</td>\n",
       "      <td>224.75</td>\n",
       "      <td>448.5</td>\n",
       "      <td>672.25</td>\n",
       "      <td>896.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Store_Area</th>\n",
       "      <td>896.0</td>\n",
       "      <td>1485.409598</td>\n",
       "      <td>250.237011</td>\n",
       "      <td>775.0</td>\n",
       "      <td>1316.75</td>\n",
       "      <td>1477.0</td>\n",
       "      <td>1653.50</td>\n",
       "      <td>2229.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Items_Available</th>\n",
       "      <td>896.0</td>\n",
       "      <td>1782.035714</td>\n",
       "      <td>299.872053</td>\n",
       "      <td>932.0</td>\n",
       "      <td>1575.50</td>\n",
       "      <td>1773.5</td>\n",
       "      <td>1982.75</td>\n",
       "      <td>2667.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Daily_Customer_Count</th>\n",
       "      <td>896.0</td>\n",
       "      <td>786.350446</td>\n",
       "      <td>265.389281</td>\n",
       "      <td>10.0</td>\n",
       "      <td>600.00</td>\n",
       "      <td>780.0</td>\n",
       "      <td>970.00</td>\n",
       "      <td>1560.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Store_Sales</th>\n",
       "      <td>896.0</td>\n",
       "      <td>59351.305804</td>\n",
       "      <td>17190.741895</td>\n",
       "      <td>14920.0</td>\n",
       "      <td>46530.00</td>\n",
       "      <td>58605.0</td>\n",
       "      <td>71872.50</td>\n",
       "      <td>116320.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      count          mean           std      min       25%  \\\n",
       "Store ID              896.0    448.500000    258.797218      1.0    224.75   \n",
       "Store_Area            896.0   1485.409598    250.237011    775.0   1316.75   \n",
       "Items_Available       896.0   1782.035714    299.872053    932.0   1575.50   \n",
       "Daily_Customer_Count  896.0    786.350446    265.389281     10.0    600.00   \n",
       "Store_Sales           896.0  59351.305804  17190.741895  14920.0  46530.00   \n",
       "\n",
       "                          50%       75%       max  \n",
       "Store ID                448.5    672.25     896.0  \n",
       "Store_Area             1477.0   1653.50    2229.0  \n",
       "Items_Available        1773.5   1982.75    2667.0  \n",
       "Daily_Customer_Count    780.0    970.00    1560.0  \n",
       "Store_Sales           58605.0  71872.50  116320.0  "
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('csv/9.Stores.csv')\n",
    "df.info()\n",
    "df.describe().transpose()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Присутствуют ли пустые значения признаков в колонке:\n",
      "Store ID                False\n",
      "Store_Area              False\n",
      "Items_Available         False\n",
      "Daily_Customer_Count    False\n",
      "Store_Sales             False\n",
      "dtype: bool \n",
      "\n"
     ]
    }
   ],
   "source": [
    "check_null_columns(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
@ -265,11 +661,181 @@
    "\n",
    "Каждый из этих проектов направлен на повышение прибыльности компании, улучшение персонализированного подхода к клиентам и снижение финансовых рисков."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 2772 entries, 0 to 2771\n",
      "Data columns (total 7 columns):\n",
      " #   Column    Non-Null Count  Dtype  \n",
      "---  ------    --------------  -----  \n",
      " 0   age       2772 non-null   int64  \n",
      " 1   sex       2772 non-null   object \n",
      " 2   bmi       2772 non-null   float64\n",
      " 3   children  2772 non-null   int64  \n",
      " 4   smoker    2772 non-null   object \n",
      " 5   region    2772 non-null   object \n",
      " 6   charges   2772 non-null   float64\n",
      "dtypes: float64(2), int64(2), object(3)\n",
      "memory usage: 151.7+ KB\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>min</th>\n",
       "      <th>25%</th>\n",
       "      <th>50%</th>\n",
       "      <th>75%</th>\n",
       "      <th>max</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>age</th>\n",
       "      <td>2772.0</td>\n",
       "      <td>39.109668</td>\n",
       "      <td>14.081459</td>\n",
       "      <td>18.0000</td>\n",
       "      <td>26.000</td>\n",
       "      <td>39.00000</td>\n",
       "      <td>51.0000</td>\n",
       "      <td>64.00000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bmi</th>\n",
       "      <td>2772.0</td>\n",
       "      <td>30.701349</td>\n",
       "      <td>6.129449</td>\n",
       "      <td>15.9600</td>\n",
       "      <td>26.220</td>\n",
       "      <td>30.44750</td>\n",
       "      <td>34.7700</td>\n",
       "      <td>53.13000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>children</th>\n",
       "      <td>2772.0</td>\n",
       "      <td>1.101732</td>\n",
       "      <td>1.214806</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>1.00000</td>\n",
       "      <td>2.0000</td>\n",
       "      <td>5.00000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>charges</th>\n",
       "      <td>2772.0</td>\n",
       "      <td>13261.369959</td>\n",
       "      <td>12151.768945</td>\n",
       "      <td>1121.8739</td>\n",
       "      <td>4687.797</td>\n",
       "      <td>9333.01435</td>\n",
       "      <td>16577.7795</td>\n",
       "      <td>63770.42801</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           count          mean           std        min       25%         50%  \\\n",
       "age       2772.0     39.109668     14.081459    18.0000    26.000    39.00000   \n",
       "bmi       2772.0     30.701349      6.129449    15.9600    26.220    30.44750   \n",
       "children  2772.0      1.101732      1.214806     0.0000     0.000     1.00000   \n",
       "charges   2772.0  13261.369959  12151.768945  1121.8739  4687.797  9333.01435   \n",
       "\n",
       "                 75%          max  \n",
       "age          51.0000     64.00000  \n",
       "bmi          34.7700     53.13000  \n",
       "children      2.0000      5.00000  \n",
       "charges   16577.7795  63770.42801  "
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('csv/5.medical_insurance.csv')\n",
    "df.info()\n",
    "df.describe().transpose()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Присутствуют ли пустые значения признаков в колонке:\n",
      "age         False\n",
      "sex         False\n",
      "bmi         False\n",
      "children    False\n",
      "smoker      False\n",
      "region      False\n",
      "charges     False\n",
      "dtype: bool \n",
      "\n"
     ]
    }
   ],
   "source": [
    "check_null_columns(df)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
-   "name": "python"
+   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.6"
  }
 },
 "nbformat": 4,