Проверил датафреймы на содержание пустых признаков

This commit is contained in:
Никита Потапов 2024-11-23 18:13:22 +04:00
parent 87062d0743
commit f26b7ea386

View File

@ -93,6 +93,237 @@
" - **Целевой признак:** Зарплата в зависимости от удаленности работы и типа занятости (фиксированная сумма или разница в зарплатах для разных типов занятости)." " - **Целевой признак:** Зарплата в зависимости от удаленности работы и типа занятости (фиксированная сумма или разница в зарплатах для разных типов занятости)."
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выполним все необходимые импорты"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from typing import Any\n",
"from math import ceil\n",
"\n",
"import pandas as pd\n",
"from pandas import DataFrame, Series\n",
"from sklearn.model_selection import train_test_split\n",
"from imblearn.over_sampling import ADASYN\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Считаем данные для первого датасета"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 3755 entries, 0 to 3754\n",
"Data columns (total 11 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 work_year 3755 non-null int64 \n",
" 1 experience_level 3755 non-null object\n",
" 2 employment_type 3755 non-null object\n",
" 3 job_title 3755 non-null object\n",
" 4 salary 3755 non-null int64 \n",
" 5 salary_currency 3755 non-null object\n",
" 6 salary_in_usd 3755 non-null int64 \n",
" 7 employee_residence 3755 non-null object\n",
" 8 remote_ratio 3755 non-null int64 \n",
" 9 company_location 3755 non-null object\n",
" 10 company_size 3755 non-null object\n",
"dtypes: int64(4), object(7)\n",
"memory usage: 322.8+ KB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>min</th>\n",
" <th>25%</th>\n",
" <th>50%</th>\n",
" <th>75%</th>\n",
" <th>max</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>work_year</th>\n",
" <td>3755.0</td>\n",
" <td>2022.373635</td>\n",
" <td>0.691448</td>\n",
" <td>2020.0</td>\n",
" <td>2022.0</td>\n",
" <td>2022.0</td>\n",
" <td>2023.0</td>\n",
" <td>2023.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>salary</th>\n",
" <td>3755.0</td>\n",
" <td>190695.571771</td>\n",
" <td>671676.500508</td>\n",
" <td>6000.0</td>\n",
" <td>100000.0</td>\n",
" <td>138000.0</td>\n",
" <td>180000.0</td>\n",
" <td>30400000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>salary_in_usd</th>\n",
" <td>3755.0</td>\n",
" <td>137570.389880</td>\n",
" <td>63055.625278</td>\n",
" <td>5132.0</td>\n",
" <td>95000.0</td>\n",
" <td>135000.0</td>\n",
" <td>175000.0</td>\n",
" <td>450000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>remote_ratio</th>\n",
" <td>3755.0</td>\n",
" <td>46.271638</td>\n",
" <td>48.589050</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>100.0</td>\n",
" <td>100.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" count mean std min 25% \\\n",
"work_year 3755.0 2022.373635 0.691448 2020.0 2022.0 \n",
"salary 3755.0 190695.571771 671676.500508 6000.0 100000.0 \n",
"salary_in_usd 3755.0 137570.389880 63055.625278 5132.0 95000.0 \n",
"remote_ratio 3755.0 46.271638 48.589050 0.0 0.0 \n",
"\n",
" 50% 75% max \n",
"work_year 2022.0 2023.0 2023.0 \n",
"salary 138000.0 180000.0 30400000.0 \n",
"salary_in_usd 135000.0 175000.0 450000.0 \n",
"remote_ratio 0.0 100.0 100.0 "
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('csv/8.ds_salaries.csv')\n",
"df.info()\n",
"df.describe().transpose()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Функция для проверки датафрейма на пропущенные (пустые) значения"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"# Missing-value report for a DataFrame\n",
"def check_null_columns(dataframe: DataFrame) -> None:\n",
"    \"\"\"Print a per-column report of missing values in ``dataframe``.\n",
"\n",
"    Always prints, for every column, whether it contains any null value.\n",
"    If at least one null exists, also prints the per-column null counts and,\n",
"    for each column that has nulls, the percentage of null rows.\n",
"\n",
"    Args:\n",
"        dataframe: The frame to inspect; it is only read, never modified.\n",
"\n",
"    Returns:\n",
"        None. The report is written to stdout.\n",
"    \"\"\"\n",
"    # Build the null mask once instead of recomputing it for every report section.\n",
"    null_mask: DataFrame = dataframe.isnull()\n",
"    has_nulls: Series = null_mask.any()\n",
"\n",
"    print('Присутствуют ли пустые значения признаков в колонке:')\n",
"    print(has_nulls, '\\n')\n",
"\n",
"    # Nothing more to report when no column contains a null.\n",
"    if not has_nulls.any():\n",
"        return\n",
"\n",
"    null_counts: Series = null_mask.sum()\n",
"    print('Количество пустых значений признаков в колонке:')\n",
"    print(null_counts, '\\n')\n",
"\n",
"    print('Процент пустых значений признаков в колонке:')\n",
"    # A null exists, so the frame has at least one row and the division is safe.\n",
"    total_rows: int = len(dataframe)\n",
"    # Iterate the precomputed counts instead of indexing dataframe[column]:\n",
"    # with duplicated column labels dataframe[column] is a DataFrame, which\n",
"    # would make the `> 0` check below raise an ambiguous-truth-value error.\n",
"    for column, null_count in null_counts.items():\n",
"        null_rate: float = null_count / total_rows * 100\n",
"        if null_rate > 0:\n",
"            print(f\"{column} процент пустых значений: {null_rate:.2f}%\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проверим колонки датафрейма на наличие пустых значений"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Присутствуют ли пустые значения признаков в колонке:\n",
"work_year False\n",
"experience_level False\n",
"employment_type False\n",
"job_title False\n",
"salary False\n",
"salary_currency False\n",
"salary_in_usd False\n",
"employee_residence False\n",
"remote_ratio False\n",
"company_location False\n",
"company_size False\n",
"dtype: bool \n",
"\n"
]
}
],
"source": [
"check_null_columns(df)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@ -192,6 +423,171 @@
" - **Цель**: Прогнозировать объем продаж (Store_Sales)." " - **Цель**: Прогнозировать объем продаж (Store_Sales)."
] ]
}, },
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 896 entries, 0 to 895\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype\n",
"--- ------ -------------- -----\n",
" 0 Store ID 896 non-null int64\n",
" 1 Store_Area 896 non-null int64\n",
" 2 Items_Available 896 non-null int64\n",
" 3 Daily_Customer_Count 896 non-null int64\n",
" 4 Store_Sales 896 non-null int64\n",
"dtypes: int64(5)\n",
"memory usage: 35.1 KB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>min</th>\n",
" <th>25%</th>\n",
" <th>50%</th>\n",
" <th>75%</th>\n",
" <th>max</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Store ID</th>\n",
" <td>896.0</td>\n",
" <td>448.500000</td>\n",
" <td>258.797218</td>\n",
" <td>1.0</td>\n",
" <td>224.75</td>\n",
" <td>448.5</td>\n",
" <td>672.25</td>\n",
" <td>896.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Store_Area</th>\n",
" <td>896.0</td>\n",
" <td>1485.409598</td>\n",
" <td>250.237011</td>\n",
" <td>775.0</td>\n",
" <td>1316.75</td>\n",
" <td>1477.0</td>\n",
" <td>1653.50</td>\n",
" <td>2229.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Items_Available</th>\n",
" <td>896.0</td>\n",
" <td>1782.035714</td>\n",
" <td>299.872053</td>\n",
" <td>932.0</td>\n",
" <td>1575.50</td>\n",
" <td>1773.5</td>\n",
" <td>1982.75</td>\n",
" <td>2667.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Daily_Customer_Count</th>\n",
" <td>896.0</td>\n",
" <td>786.350446</td>\n",
" <td>265.389281</td>\n",
" <td>10.0</td>\n",
" <td>600.00</td>\n",
" <td>780.0</td>\n",
" <td>970.00</td>\n",
" <td>1560.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Store_Sales</th>\n",
" <td>896.0</td>\n",
" <td>59351.305804</td>\n",
" <td>17190.741895</td>\n",
" <td>14920.0</td>\n",
" <td>46530.00</td>\n",
" <td>58605.0</td>\n",
" <td>71872.50</td>\n",
" <td>116320.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" count mean std min 25% \\\n",
"Store ID 896.0 448.500000 258.797218 1.0 224.75 \n",
"Store_Area 896.0 1485.409598 250.237011 775.0 1316.75 \n",
"Items_Available 896.0 1782.035714 299.872053 932.0 1575.50 \n",
"Daily_Customer_Count 896.0 786.350446 265.389281 10.0 600.00 \n",
"Store_Sales 896.0 59351.305804 17190.741895 14920.0 46530.00 \n",
"\n",
" 50% 75% max \n",
"Store ID 448.5 672.25 896.0 \n",
"Store_Area 1477.0 1653.50 2229.0 \n",
"Items_Available 1773.5 1982.75 2667.0 \n",
"Daily_Customer_Count 780.0 970.00 1560.0 \n",
"Store_Sales 58605.0 71872.50 116320.0 "
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('csv/9.Stores.csv')\n",
"df.info()\n",
"df.describe().transpose()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Присутствуют ли пустые значения признаков в колонке:\n",
"Store ID False\n",
"Store_Area False\n",
"Items_Available False\n",
"Daily_Customer_Count False\n",
"Store_Sales False\n",
"dtype: bool \n",
"\n"
]
}
],
"source": [
"check_null_columns(df)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@ -265,11 +661,181 @@
"\n", "\n",
"Каждый из этих проектов направлен на повышение прибыльности компании, улучшение персонализированного подхода к клиентам и снижение финансовых рисков." "Каждый из этих проектов направлен на повышение прибыльности компании, улучшение персонализированного подхода к клиентам и снижение финансовых рисков."
] ]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 2772 entries, 0 to 2771\n",
"Data columns (total 7 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 age 2772 non-null int64 \n",
" 1 sex 2772 non-null object \n",
" 2 bmi 2772 non-null float64\n",
" 3 children 2772 non-null int64 \n",
" 4 smoker 2772 non-null object \n",
" 5 region 2772 non-null object \n",
" 6 charges 2772 non-null float64\n",
"dtypes: float64(2), int64(2), object(3)\n",
"memory usage: 151.7+ KB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>min</th>\n",
" <th>25%</th>\n",
" <th>50%</th>\n",
" <th>75%</th>\n",
" <th>max</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>age</th>\n",
" <td>2772.0</td>\n",
" <td>39.109668</td>\n",
" <td>14.081459</td>\n",
" <td>18.0000</td>\n",
" <td>26.000</td>\n",
" <td>39.00000</td>\n",
" <td>51.0000</td>\n",
" <td>64.00000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bmi</th>\n",
" <td>2772.0</td>\n",
" <td>30.701349</td>\n",
" <td>6.129449</td>\n",
" <td>15.9600</td>\n",
" <td>26.220</td>\n",
" <td>30.44750</td>\n",
" <td>34.7700</td>\n",
" <td>53.13000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>children</th>\n",
" <td>2772.0</td>\n",
" <td>1.101732</td>\n",
" <td>1.214806</td>\n",
" <td>0.0000</td>\n",
" <td>0.000</td>\n",
" <td>1.00000</td>\n",
" <td>2.0000</td>\n",
" <td>5.00000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>charges</th>\n",
" <td>2772.0</td>\n",
" <td>13261.369959</td>\n",
" <td>12151.768945</td>\n",
" <td>1121.8739</td>\n",
" <td>4687.797</td>\n",
" <td>9333.01435</td>\n",
" <td>16577.7795</td>\n",
" <td>63770.42801</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" count mean std min 25% 50% \\\n",
"age 2772.0 39.109668 14.081459 18.0000 26.000 39.00000 \n",
"bmi 2772.0 30.701349 6.129449 15.9600 26.220 30.44750 \n",
"children 2772.0 1.101732 1.214806 0.0000 0.000 1.00000 \n",
"charges 2772.0 13261.369959 12151.768945 1121.8739 4687.797 9333.01435 \n",
"\n",
" 75% max \n",
"age 51.0000 64.00000 \n",
"bmi 34.7700 53.13000 \n",
"children 2.0000 5.00000 \n",
"charges 16577.7795 63770.42801 "
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('csv/5.medical_insurance.csv')\n",
"df.info()\n",
"df.describe().transpose()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Присутствуют ли пустые значения признаков в колонке:\n",
"age False\n",
"sex False\n",
"bmi False\n",
"children False\n",
"smoker False\n",
"region False\n",
"charges False\n",
"dtype: bool \n",
"\n"
]
}
],
"source": [
"check_null_columns(df)"
]
} }
], ],
"metadata": { "metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": { "language_info": {
"name": "python" "codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
} }
}, },
"nbformat": 4, "nbformat": 4,