From f26b7ea386b22609d1dc2b00ad45118e1487bda9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9D=D0=B8=D0=BA=D0=B8=D1=82=D0=B0=20=D0=9F=D0=BE=D1=82?= =?UTF-8?q?=D0=B0=D0=BF=D0=BE=D0=B2?= Date: Sat, 23 Nov 2024 18:13:22 +0400 Subject: [PATCH] =?UTF-8?q?=D0=9F=D1=80=D0=BE=D0=B2=D0=B5=D1=80=D0=B8?= =?UTF-8?q?=D0=BB=20=D0=B4=D0=B0=D1=82=D0=B0=D1=84=D1=80=D0=B5=D0=B9=D0=BC?= =?UTF-8?q?=D1=8B=20=D0=BD=D0=B0=20=D1=81=D0=BE=D0=B4=D0=B5=D1=80=D0=B6?= =?UTF-8?q?=D0=B0=D0=BD=D0=B8=D0=B5=20=D0=BF=D1=83=D1=81=D1=82=D1=8B=D1=85?= =?UTF-8?q?=20=D0=BF=D1=80=D0=B8=D0=B7=D0=BD=D0=B0=D0=BA=D0=BE=D0=B2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lab_2/lab2.ipynb | 568 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 567 insertions(+), 1 deletion(-) diff --git a/lab_2/lab2.ipynb b/lab_2/lab2.ipynb index e8cf84b..f7bb449 100644 --- a/lab_2/lab2.ipynb +++ b/lab_2/lab2.ipynb @@ -93,6 +93,237 @@ " - **Целевой признак:** Зарплата в зависимости от удаленности работы и типа занятости (фиксированная сумма или разница в зарплатах для разных типов занятости)." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Выполним все необходимые импорты" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Any\n", + "from math import ceil\n", + "\n", + "import pandas as pd\n", + "from pandas import DataFrame, Series\n", + "from sklearn.model_selection import train_test_split\n", + "from imblearn.over_sampling import ADASYN\n", + "from imblearn.under_sampling import RandomUnderSampler\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Считаем данные для первого датасета" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 3755 entries, 0 to 3754\n", + "Data columns (total 11 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 work_year 3755 non-null int64 \n", + " 1 experience_level 3755 non-null object\n", + " 2 employment_type 3755 non-null object\n", + " 3 job_title 3755 non-null object\n", + " 4 salary 3755 non-null int64 \n", + " 5 salary_currency 3755 non-null object\n", + " 6 salary_in_usd 3755 non-null int64 \n", + " 7 employee_residence 3755 non-null object\n", + " 8 remote_ratio 3755 non-null int64 \n", + " 9 company_location 3755 non-null object\n", + " 10 company_size 3755 non-null object\n", + "dtypes: int64(4), object(7)\n", + "memory usage: 322.8+ KB\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countmeanstdmin25%50%75%max
work_year3755.02022.3736350.6914482020.02022.02022.02023.02023.0
salary3755.0190695.571771671676.5005086000.0100000.0138000.0180000.030400000.0
salary_in_usd3755.0137570.38988063055.6252785132.095000.0135000.0175000.0450000.0
remote_ratio3755.046.27163848.5890500.00.00.0100.0100.0
\n", + "
" + ], + "text/plain": [ + " count mean std min 25% \\\n", + "work_year 3755.0 2022.373635 0.691448 2020.0 2022.0 \n", + "salary 3755.0 190695.571771 671676.500508 6000.0 100000.0 \n", + "salary_in_usd 3755.0 137570.389880 63055.625278 5132.0 95000.0 \n", + "remote_ratio 3755.0 46.271638 48.589050 0.0 0.0 \n", + "\n", + " 50% 75% max \n", + "work_year 2022.0 2023.0 2023.0 \n", + "salary 138000.0 180000.0 30400000.0 \n", + "salary_in_usd 135000.0 175000.0 450000.0 \n", + "remote_ratio 0.0 100.0 100.0 " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('csv/8.ds_salaries.csv')\n", + "df.info()\n", + "df.describe().transpose()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Метод проверки пустых значений в датафрейме" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "# Проверка пропущенных данных\n", + "def check_null_columns(dataframe: DataFrame) -> None:\n", + " print('Присутствуют ли пустые значения признаков в колонке:')\n", + " print(dataframe.isnull().any(), '\\n')\n", + "\n", + " if any(dataframe.isnull().any()):\n", + " print('Количество пустых значений признаков в колонке:')\n", + " print(dataframe.isnull().sum(), '\\n')\n", + "\n", + " print('Процент пустых значений признаков в колонке:')\n", + " for column in dataframe.columns:\n", + " null_rate: float = dataframe[column].isnull().sum() / len(dataframe) * 100\n", + " if null_rate > 0:\n", + " print(f\"{column} процент пустых значений: {null_rate:.2f}%\") " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Проверим на пустые значения в колонках" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Присутствуют ли пустые значения признаков в колонке:\n", + "work_year False\n", + "experience_level False\n", + "employment_type False\n", + "job_title False\n", + "salary False\n", + "salary_currency False\n", + "salary_in_usd False\n", + "employee_residence False\n", + "remote_ratio False\n", + "company_location False\n", + "company_size False\n", + "dtype: bool \n", + "\n" + ] + } + ], + "source": [ + "check_null_columns(df)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -192,6 +423,171 @@ " - **Цель**: Прогнозировать объем продаж (Store_Sales)." ] }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 896 entries, 0 to 895\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 Store ID 896 non-null int64\n", + " 1 Store_Area 896 non-null int64\n", + " 2 Items_Available 896 non-null int64\n", + " 3 Daily_Customer_Count 896 non-null int64\n", + " 4 Store_Sales 896 non-null int64\n", + "dtypes: int64(5)\n", + "memory usage: 35.1 KB\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countmeanstdmin25%50%75%max
Store ID896.0448.500000258.7972181.0224.75448.5672.25896.0
Store_Area896.01485.409598250.237011775.01316.751477.01653.502229.0
Items_Available896.01782.035714299.872053932.01575.501773.51982.752667.0
Daily_Customer_Count896.0786.350446265.38928110.0600.00780.0970.001560.0
Store_Sales896.059351.30580417190.74189514920.046530.0058605.071872.50116320.0
\n", + "
" + ], + "text/plain": [ + " count mean std min 25% \\\n", + "Store ID 896.0 448.500000 258.797218 1.0 224.75 \n", + "Store_Area 896.0 1485.409598 250.237011 775.0 1316.75 \n", + "Items_Available 896.0 1782.035714 299.872053 932.0 1575.50 \n", + "Daily_Customer_Count 896.0 786.350446 265.389281 10.0 600.00 \n", + "Store_Sales 896.0 59351.305804 17190.741895 14920.0 46530.00 \n", + "\n", + " 50% 75% max \n", + "Store ID 448.5 672.25 896.0 \n", + "Store_Area 1477.0 1653.50 2229.0 \n", + "Items_Available 1773.5 1982.75 2667.0 \n", + "Daily_Customer_Count 780.0 970.00 1560.0 \n", + "Store_Sales 58605.0 71872.50 116320.0 " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('csv/9.Stores.csv')\n", + "df.info()\n", + "df.describe().transpose()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Присутствуют ли пустые значения признаков в колонке:\n", + "Store ID False\n", + "Store_Area False\n", + "Items_Available False\n", + "Daily_Customer_Count False\n", + "Store_Sales False\n", + "dtype: bool \n", + "\n" + ] + } + ], + "source": [ + "check_null_columns(df)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -265,11 +661,181 @@ "\n", "Каждый из этих проектов направлен на повышение прибыльности компании, улучшение персонализированного подхода к клиентам и снижение финансовых рисков." ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 2772 entries, 0 to 2771\n", + "Data columns (total 7 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 age 2772 non-null int64 \n", + " 1 sex 2772 non-null object \n", + " 2 bmi 2772 non-null float64\n", + " 3 children 2772 non-null int64 \n", + " 4 smoker 2772 non-null object \n", + " 5 region 2772 non-null object \n", + " 6 charges 2772 non-null float64\n", + "dtypes: float64(2), int64(2), object(3)\n", + "memory usage: 151.7+ KB\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countmeanstdmin25%50%75%max
age2772.039.10966814.08145918.000026.00039.0000051.000064.00000
bmi2772.030.7013496.12944915.960026.22030.4475034.770053.13000
children2772.01.1017321.2148060.00000.0001.000002.00005.00000
charges2772.013261.36995912151.7689451121.87394687.7979333.0143516577.779563770.42801
\n", + "
" + ], + "text/plain": [ + " count mean std min 25% 50% \\\n", + "age 2772.0 39.109668 14.081459 18.0000 26.000 39.00000 \n", + "bmi 2772.0 30.701349 6.129449 15.9600 26.220 30.44750 \n", + "children 2772.0 1.101732 1.214806 0.0000 0.000 1.00000 \n", + "charges 2772.0 13261.369959 12151.768945 1121.8739 4687.797 9333.01435 \n", + "\n", + " 75% max \n", + "age 51.0000 64.00000 \n", + "bmi 34.7700 53.13000 \n", + "children 2.0000 5.00000 \n", + "charges 16577.7795 63770.42801 " + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('csv/5.medical_insurance.csv')\n", + "df.info()\n", + "df.describe().transpose()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Присутствуют ли пустые значения признаков в колонке:\n", + "age False\n", + "sex False\n", + "bmi False\n", + "children False\n", + "smoker False\n", + "region False\n", + "charges False\n", + "dtype: bool \n", + "\n" + ] + } + ], + "source": [ + "check_null_columns(df)" + ] } ], "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.6" } }, "nbformat": 4,