From efeb865791085740073c121009e10a7935a6ef57 Mon Sep 17 00:00:00 2001 From: dex_moth Date: Fri, 20 Dec 2024 23:47:13 +0400 Subject: [PATCH 1/7] =?UTF-8?q?lab=203=20=D0=B8=204?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lab_3/Lab3.ipynb | 551 ++++++++++++++++++++++++++++ lab_4/Lab4.ipynb | 911 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1462 insertions(+) create mode 100644 lab_3/Lab3.ipynb create mode 100644 lab_4/Lab4.ipynb diff --git a/lab_3/Lab3.ipynb b/lab_3/Lab3.ipynb new file mode 100644 index 0000000..dcb1c9d --- /dev/null +++ b/lab_3/Lab3.ipynb @@ -0,0 +1,551 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Лабораторная 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Вариант 7. Экономика стран" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Бизнес-цели:\n", + "1) прогнозирование уровня инфляции на основе данных за прошлые годы\n", + "2) определение факторов, значительно влияющих на показатель ВВП на душу населения" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Технические цели:\n", + "1) Разработать модель машинного обучения (МО) для прогнозирования уровня инфляции на основе исторических данных\n", + "2) Проанализировать взаимосвязь между экономическими показателями и ВВП" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['stock index', 'country', 'year', 'index price', 'log_indexprice',\n", + " 'inflationrate', 'oil prices', 'exchange_rate', 'gdppercent',\n", + " 'percapitaincome', 'unemploymentrate', 'manufacturingoutput',\n", + " 'tradebalance', 'USTreasury'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "df = pd.read_csv(\".//csv//EconomicData.csv\")\n", + 
"print(df.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Подготовка данных:" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "stock index 0\n", + "country 0\n", + "year 0\n", + "index price 52\n", + "log_indexprice 0\n", + "inflationrate 43\n", + "oil prices 0\n", + "exchange_rate 2\n", + "gdppercent 19\n", + "percapitaincome 1\n", + "unemploymentrate 21\n", + "manufacturingoutput 91\n", + "tradebalance 4\n", + "USTreasury 0\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "print(df.isnull().sum())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Заполним пустые значения медианами:" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + "for column in df.columns:\n", + " if (column != \"stock index\" and column != \"country\"):\n", + " df[column] = df[column].fillna(df[column].median())  # fillna returns a new Series; assign it back or nothing is imputed" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(12, 8))\n", + "ax = sns.scatterplot(x='exchange_rate', y='oil prices', hue='inflationrate', data=df)\n", + "plt.title('Уровень инфляции')\n", + "plt.xlabel('Валютный курс')\n", + "plt.ylabel('Цены на нефть')\n", + "plt.legend(title='inflationrate')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "Q1 = df['oil prices'].quantile(0.25)\n", + "Q3 = df['oil prices'].quantile(0.75)\n", + "IQR = Q3 - Q1\n", + "\n", + "threshold = 1.5 * IQR\n", + "outliers = (df['oil prices'] < (Q1 - threshold)) | (df['oil prices'] > (Q3 + threshold))\n", + "\n", + "median_rating = df['oil prices'].median()\n", + "df.loc[outliers, 'oil prices'] = median_rating\n", + "\n", + "plt.figure(figsize=(12, 8))\n", + "ax = sns.scatterplot(x='exchange_rate', y='oil prices', hue='inflationrate', data=df)  # plot the column that was just cleaned; it matches the y-axis label\n", + "plt.title('Уровень инфляции')\n", + "plt.xlabel('Валютный курс')\n", + "plt.ylabel('Цены на нефть')\n", + "plt.legend(title='inflationrate')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Разбиение данных на выборки и оценка сбалансированности выборки" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Размер обучающей выборки: 221\n", + "Размер контрольной выборки: 74\n", + "Размер тестовой выборки: 74\n" + ] + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "# split the full frame into train and test parts\n", + "train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)\n", + "\n", + "# split the train part further into train and validation\n", + "train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n", + "\n", + "print(\"Размер обучающей выборки:\", len(train_df))\n", + "print(\"Размер контрольной выборки:\", len(val_df))\n", + "print(\"Размер тестовой выборки:\", len(test_df))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Конструирование признаков" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1) Кодирование категориальных признаков" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from sklearn.preprocessing import OneHotEncoder\n", + "\n", + "df = pd.get_dummies(df, columns=['country'])\n", + "print(df.head())  # call head(); without parentheses only the bound-method repr is printed" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. Дискретизация числовых признаков" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "min = 27.0\n", + "max = 65280.0\n", + "10880.0\n" + ] + } + ], + "source": [ + "print(f\"min = {df['percapitaincome'].min()}\")\n", + "print(f\"max = {df['percapitaincome'].max()}\")\n", + "print(df['percapitaincome'].max()/6)" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from sklearn.preprocessing import KBinsDiscretizer\n", + "\n", + "bins = [0, 11000, 22000, 33000, 44000, float('inf')]\n", + "labels = ['незначительный', 'низкий', 'средний', 'высокий', 'очень высокий']\n", + "\n", + "df['percapitaincome_level'] = pd.cut(df['percapitaincome'], bins=bins, labels=labels)\n", + "print(df['percapitaincome_level'].head())  # call head() to show the first binned values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3) Ручной синтез признаков" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n", + " warnings.warn(\n", + "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\featuretools\\synthesis\\dfs.py:321: UnusedPrimitiveWarning: Some specified primitives were not used during DFS:\n", + " agg_primitives: ['max', 'mean', 'min', 'std', 'sum']\n", + "This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.\n", + " warnings.warn(warning_msg, UnusedPrimitiveWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Сгенерированные признаки:\n", + " stock index year index price log_indexprice inflationrate \\\n", + "index \n", + "0 NASDAQ 1980.0 168.61 2.23 0.14 \n", + "1 NASDAQ 1981.0 203.15 2.31 0.10 \n", + "2 NASDAQ 1982.0 188.98 2.28 0.06 \n", + "3 NASDAQ 1983.0 285.43 2.46 0.03 \n", + "4 NASDAQ 1984.0 248.89 2.40 0.04 \n", + "\n", + " oil prices exchange_rate gdppercent percapitaincome \\\n", + "index \n", + "0 21.59 1.0 0.09 12575 \n", + "1 31.77 1.0 0.12 13976 \n", + "2 28.52 1.0 0.04 14434 \n", + "3 26.19 1.0 0.09 15544 \n", + "4 25.88 1.0 0.11 17121 \n", + "\n", + " unemploymentrate ... oil prices * year percapitaincome * USTreasury \\\n", + "index ... \n", + "0 0.07 ... 42748.20 1383.25 \n", + "1 0.08 ... 
62936.37 1956.64 \n", + "2 0.10 ... 56526.64 1876.42 \n", + "3 0.10 ... 51934.77 1709.84 \n", + "4 0.08 ... 51345.92 2054.52 \n", + "\n", + " percapitaincome * tradebalance percapitaincome * unemploymentrate \\\n", + "index \n", + "0 -164229.50 880.25 \n", + "1 -174979.52 1118.08 \n", + "2 -288246.98 1443.40 \n", + "3 -802692.16 1554.40 \n", + "4 -1758840.33 1369.68 \n", + "\n", + " percapitaincome * year tradebalance * USTreasury \\\n", + "index \n", + "0 24898500.0 -1.4366 \n", + "1 27686456.0 -1.7528 \n", + "2 28608188.0 -2.5961 \n", + "3 30823752.0 -5.6804 \n", + "4 33968064.0 -12.3276 \n", + "\n", + " tradebalance * unemploymentrate tradebalance * year \\\n", + "index \n", + "0 -0.9142 -25858.80 \n", + "1 -1.0016 -24802.12 \n", + "2 -1.9970 -39580.54 \n", + "3 -5.1640 -102402.12 \n", + "4 -8.2184 -203816.32 \n", + "\n", + " unemploymentrate * USTreasury unemploymentrate * year \n", + "index \n", + "0 0.0077 138.60 \n", + "1 0.0112 158.48 \n", + "2 0.0130 198.20 \n", + "3 0.0110 198.30 \n", + "4 0.0096 158.72 \n", + "\n", + "[5 rows x 207 columns]\n", + "\n", + "Описание:\n", + "[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ]\n" + ] + } + ], + "source": [ + "# pip install featuretools\n", + "import featuretools as ft\n", + "\n", + "es = ft.EntitySet(id='economy_data')\n", + "es.add_dataframe(\n", + " dataframe=df,\n", + " dataframe_name='economy',\n", + " index='index',\n", + " make_index=True\n", + ")\n", + "\n", + "# Автоматическое конструирование\n", + "feature_matrix, feature_defs = ft.dfs(\n", + " entityset=es,\n", + " target_dataframe_name='economy',\n", + " agg_primitives=['mean', 'sum', 'max', 
'min', 'std'],\n", + " trans_primitives=['add_numeric', 'multiply_numeric'],\n", + " max_depth=2 \n", + ")\n", + "\n", + "print(\"Сгенерированные признаки:\")\n", + "print(feature_matrix.head())\n", + "print(\"\\nОписание:\")\n", + "print(feature_defs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "4. Масштабирование" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import MinMaxScaler\n", + "\n", + "scaler_minmax = MinMaxScaler()\n", + "df[['index price_scaled', 'log_indexprice_scaled']] = scaler_minmax.fit_transform(df[['index price', 'log_indexprice']])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Оценка качества наборов признаков:\n", + "Набор данных достаточно полный, но требует предварительной обработки (заполнение пропусков, удаление выбросов, нормализация). После обработки он может быть использован для анализа и построения моделей." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Scripts", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/lab_4/Lab4.ipynb b/lab_4/Lab4.ipynb new file mode 100644 index 0000000..0b8116e --- /dev/null +++ b/lab_4/Lab4.ipynb @@ -0,0 +1,911 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['id', 'Gender', 'Age', 'City', 'Profession', 'Academic Pressure',\n", + " 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction',\n", + " 'Sleep Duration', 'Dietary Habits', 'Degree',\n", + " 'Have you ever had suicidal thoughts ?', 'Work/Study Hours',\n", + " 'Financial Stress', 'Family History of Mental Illness', 'Depression'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from matplotlib.ticker import FuncFormatter\n", + "\n", + "df = pd.read_csv(\".//csv//Student Depression Dataset.csv\")\n", + "print(df.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " id Gender Age City Profession Academic Pressure \\\n", + "0 2 Male 33.0 Visakhapatnam Student 5.0 \n", + "1 8 Female 24.0 Bangalore Student 2.0 \n", + "2 26 Male 31.0 Srinagar Student 3.0 \n", + "3 30 Female 28.0 Varanasi Student 3.0 \n", + "4 32 Female 25.0 Jaipur Student 4.0 \n", + "\n", + " Work Pressure CGPA Study Satisfaction Job Satisfaction \\\n", + "0 0.0 8.97 2.0 0.0 \n", + "1 0.0 5.90 5.0 0.0 \n", + "2 0.0 7.03 5.0 0.0 \n", + "3 0.0 
5.59 2.0 0.0 \n", + "4 0.0 8.13 3.0 0.0 \n", + "\n", + " Sleep Duration Dietary Habits Degree \\\n", + "0 5-6 hours Healthy B.Pharm \n", + "1 5-6 hours Moderate BSc \n", + "2 Less than 5 hours Healthy BA \n", + "3 7-8 hours Moderate BCA \n", + "4 5-6 hours Moderate M.Tech \n", + "\n", + " Have you ever had suicidal thoughts ? Work/Study Hours Financial Stress \\\n", + "0 Yes 3.0 1.0 \n", + "1 No 3.0 2.0 \n", + "2 No 9.0 1.0 \n", + "3 Yes 4.0 5.0 \n", + "4 Yes 1.0 1.0 \n", + "\n", + " Family History of Mental Illness Depression \n", + "0 No 1 \n", + "1 Yes 0 \n", + "2 Yes 0 \n", + "3 Yes 1 \n", + "4 No 0 \n" + ] + } + ], + "source": [ + "print(df.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Бизнес-цель исследования\n", + "Разработать и внедрить систему прогнозирования уровня депрессии среди обучающихся, которая позволит выявить группы риска на ранних этапах. Результаты исследования могут быть полезны психологам, педагогам и администрации учебных заведений.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Описание набора данных для анализа\n", + "Набор данных содержит информацию о психологическом состоянии обучающихся и включает следующие поля:\n", + "- id – идентификатор, число\n", + "- Gender – пол, строка\n", + "- Age – возраст, дробное число\n", + "- City – город, строка\n", + "- Profession – профессия, строка\n", + "- Academic Pressure – академическое давление, дробное число (от 1.00 до 5.00)\n", + "- Work Pressure – рабочее давление, дробное число (от 1.00 до 5.00)\n", + "- CGPA – средний балл (GPA), дробное число\n", + "- Study Satisfaction – удовлетворенность учебой, дробное число (от 1.00 до 5.00)\n", + "- Job Satisfaction – удовлетворенность работой, дробное число (от 1.00 до 5.00)\n", + "- Sleep Duration – продолжительность сна, строка\n", + "- Dietary Habits – пищевые привычки, строка\n", + "- Degree – степень (образование), строка\n", + "- Have you ever had suicidal thoughts? 
– Были ли у вас когда-либо суицидальные мысли? строка (yes/no)\n", + "- Work/Study Hours – часы работы/учебы, дробное число\n", + "- Financial Stress – финансовый стресс, дробное число (от 1.00 до 5.00)\n", + "- Family History of Mental Illness – семейный анамнез психических заболеваний, строка (yes/no)\n", + "- Depression – депрессия, булевое значение (1/0)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Обработка данных" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "Gender 0\n", + "Age 0\n", + "City 0\n", + "Profession 0\n", + "Academic Pressure 0\n", + "Work Pressure 0\n", + "CGPA 0\n", + "Study Satisfaction 0\n", + "Job Satisfaction 0\n", + "Sleep Duration 0\n", + "Dietary Habits 0\n", + "Degree 0\n", + "Have you ever had suicidal thoughts ? 0\n", + "Work/Study Hours 0\n", + "Financial Stress 3\n", + "Family History of Mental Illness 0\n", + "Depression 0\n", + "dtype: int64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "df.dropna(subset=['Financial Stress'], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "features = ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', \n", + " 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress', 'Depression']\n", + "\n", + "plt.figure(figsize=(15, 10))\n", + "for i, feature in enumerate(features, 1):\n", + " plt.subplot(3, 3, i)\n", + " sns.boxplot(y=df[feature], color='skyblue')\n", + " plt.title(f'Boxplot of {feature}')\n", + " plt.ylabel(feature)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В признаке Age много выбросов. Заменим их медианным значением." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "Q1 = df['Age'].quantile(0.25)\n", + "Q3 = df['Age'].quantile(0.75)\n", + "IQR = Q3 - Q1\n", + "\n", + "threshold = 1.5 * IQR\n", + "outliers = (df['Age'] < (Q1 - threshold)) | (df['Age'] > (Q3 + threshold))\n", + "\n", + "median_rating = df['Age'].median()\n", + "df.loc[outliers, 'Age'] = median_rating\n", + "\n", + "plt.figure(figsize=(8, 6))\n", + "sns.boxplot(y=df['Age'], color='skyblue')\n", + "plt.title('Boxplot of Age')\n", + "plt.ylabel('Age')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Конструирование признаков с помощью меток" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import LabelEncoder\n", + "\n", + "le = LabelEncoder()\n", + "df['Gender'] = le.fit_transform(df['Gender'])\n", + "df['City'] = le.fit_transform(df['City'])\n", + "df['Dietary Habits'] = le.fit_transform(df['Dietary Habits'])\n", + "df['Degree'] = le.fit_transform(df['Degree'])\n", + "df['Have you ever had suicidal thoughts ?'] = le.fit_transform(df['Have you ever had suicidal thoughts ?'])\n", + "df['Sleep Duration'] = le.fit_transform(df['Sleep Duration'])\n", + "df['Profession'] = le.fit_transform(df['Profession'])\n", + "df['Study Satisfaction'] = le.fit_transform(df['Study Satisfaction'])\n", + "df['Family History of Mental Illness'] = le.fit_transform(df['Family History of Mental Illness'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "разделение на признаки и целевую переменную" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "x = df.drop(columns=['id', 'Depression'])  # 'id' is only a row identifier, so it is excluded from the predictors together with the target\n", + "y = df['Depression']" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import GridSearchCV, train_test_split  # GridSearchCV is needed by the Lasso and Gradient Boosting cells that follow\n", + 
"\n", + "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1) Метод регрессии Лассо\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Лучшие гиперпараметры для Lasso:\n", + "{'alpha': 0.01, 'fit_intercept': False}\n" + ] + } + ], + "source": [ + "from sklearn.linear_model import Lasso\n", + "\n", + "param_grid_lasso = {\n", + " 'alpha': [0.01, 0.1, 1.0, 10.0],\n", + " 'fit_intercept': [True, False],\n", + "}\n", + "\n", + "# Создание объекта GridSearchCV\n", + "grid_search_lasso = GridSearchCV(\n", + " estimator=Lasso(), \n", + " param_grid=param_grid_lasso, \n", + " cv=5, \n", + " scoring='neg_mean_squared_error', \n", + " n_jobs=-1 \n", + ")\n", + "\n", + "grid_search_lasso.fit(x_train, y_train)\n", + "\n", + "# Вывод лучших гиперпараметров\n", + "print(\"Лучшие гиперпараметры для Lasso:\")\n", + "print(grid_search_lasso.best_params_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2) Метод градиентного бустинга" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py:540: FitFailedWarning: \n", + "1215 fits failed out of a total of 3645.\n", + "The score on these train-test partitions for these parameters will be set to nan.\n", + "If these failures are not expected, you can try to debug them by setting error_score='raise'.\n", + "\n", + "Below are more details about the failures:\n", + "--------------------------------------------------------------------------------\n", + "978 fits failed with the following error:\n", + "Traceback (most recent call last):\n", + " File 
\"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 888, in _fit_and_score\n", + " estimator.fit(X_train, y_train, **fit_params)\n", + " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 1466, in wrapper\n", + " estimator._validate_params()\n", + " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 666, in _validate_params\n", + " validate_parameter_constraints(\n", + " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py\", line 95, in validate_parameter_constraints\n", + " raise InvalidParameterError(\n", + "sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of GradientBoostingRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto' instead.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "237 fits failed with the following error:\n", + "Traceback (most recent call last):\n", + " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 888, in _fit_and_score\n", + " estimator.fit(X_train, y_train, **fit_params)\n", + " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 1466, in wrapper\n", + " estimator._validate_params()\n", + " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 666, in _validate_params\n", + " validate_parameter_constraints(\n", + " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py\", line 95, in validate_parameter_constraints\n", + " raise InvalidParameterError(\n", + "sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of GradientBoostingRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. 
Got 'auto' instead.\n", + "\n", + " warnings.warn(some_fits_failed_message, FitFailedWarning)\n", + "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\numpy\\ma\\core.py:2881: RuntimeWarning: invalid value encountered in cast\n", + " _data = np.array(data, dtype=dtype, copy=copy,\n", + "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:1103: UserWarning: One or more of the test scores are non-finite: [ nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan -0.18767441 -0.15799837 -0.13080278\n", + " -0.18762913 -0.15792709 -0.13056114 -0.18792038 -0.15737146 -0.130218\n", + " -0.18725961 -0.157967 -0.13047453 -0.18766583 -0.15779565 -0.13094863\n", + " -0.18798705 -0.15693978 -0.13061215 -0.18766317 -0.15746848 -0.13072918\n", + " -0.18864158 -0.15666133 -0.13095037 -0.18817206 -0.15805489 -0.13086126\n", + " -0.18707465 -0.15864932 -0.13104947 -0.18818902 -0.15828572 -0.13063871\n", + " -0.18701628 -0.15853864 -0.13019458 -0.18740927 -0.15836397 -0.13065455\n", + " -0.18768748 -0.15828297 -0.1309458 -0.18845004 -0.15696395 -0.13023062\n", + " -0.18754854 -0.15899615 -0.13061707 -0.18831427 -0.15819939 -0.13096524\n", + " -0.18662963 -0.15815869 -0.13089186 nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " -0.1758914 -0.1442684 -0.12093344 -0.1758927 -0.14423731 -0.12084543\n", + " -0.17573339 -0.14419842 -0.12076166 -0.17512045 -0.14435454 -0.1207299\n", + " -0.17669645 -0.14397965 -0.12087019 -0.17605424 -0.1438664 -0.12091068\n", + " -0.17582192 -0.1443651 -0.12097165 -0.17588422 -0.14421003 -0.12081764\n", + " -0.17522742 -0.14424357 -0.12086484 -0.17530986 -0.14433713 -0.12091757\n", + " -0.17565647 -0.14408902 -0.12075918 -0.17561884 -0.14426355 -0.12094066\n", + " -0.17522371 -0.1439869 -0.12099023 -0.17619772 -0.14396131 -0.12079667\n", + " -0.17710789 
-0.1448419 -0.12087822 -0.17608534 -0.14416684 -0.12087865\n", + " -0.1754675 -0.1442258 -0.12068226 -0.17611334 -0.14433552 -0.12093556\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan -0.16938321 -0.13763002 -0.11703902\n", + " -0.16953091 -0.13736586 -0.11695779 -0.16881837 -0.1375676 -0.11694438\n", + " -0.16927898 -0.13748177 -0.11689982 -0.16921265 -0.13757375 -0.11682524\n", + " -0.16915872 -0.13727377 -0.11694336 -0.16939766 -0.13734972 -0.1167447\n", + " -0.16924214 -0.1373768 -0.11674816 -0.16918278 -0.13746085 -0.1169816\n", + " -0.16927003 -0.13740063 -0.1169564 -0.16916501 -0.13752074 -0.11687641\n", + " -0.16928973 -0.13751536 -0.11697948 -0.16934836 -0.13727436 -0.11693615\n", + " -0.16912453 -0.13748699 -0.11693425 -0.1692788 -0.13750784 -0.11694655\n", + " -0.16919354 -0.13747437 -0.11708782 -0.16940009 -0.13757749 -0.11700586\n", + " -0.1692801 -0.13725384 -0.11684394 nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " -0.11606052 -0.1140225 -0.11403709 -0.11627212 -0.1139982 -0.11402075\n", + " -0.11613561 -0.11407941 -0.11420487 -0.11666225 -0.11462523 -0.11431901\n", + " -0.11604817 -0.11456211 -0.11392092 -0.11609343 -0.11394228 -0.11414071\n", + " -0.11611685 -0.11420178 -0.11405459 -0.11594404 -0.11408614 -0.11391662\n", + " -0.11590886 -0.11396465 -0.11389125 -0.11616694 -0.11441846 -0.11417015\n", + " -0.11617368 -0.11429765 -0.1139636 -0.11616763 -0.11433984 -0.11412121\n", + " -0.11625618 -0.11402999 -0.11419791 -0.11613603 -0.114206 -0.11423922\n", + " -0.1160801 -0.11431896 -0.11416734 -0.11608923 -0.11455498 -0.11417448\n", + " -0.11605165 -0.11427773 -0.11392205 -0.11606243 -0.11408421 -0.11395292\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan 
-0.11281447 -0.11245904 -0.11308822\n", + " -0.11256366 -0.11230094 -0.1130767 -0.11282651 -0.1121034 -0.11283479\n", + " -0.11260704 -0.1125136 -0.11288977 -0.11278304 -0.11242278 -0.11268564\n", + " -0.11263359 -0.11236227 -0.11329411 -0.11231603 -0.1124533 -0.11278826\n", + " -0.11291545 -0.11241223 -0.11250702 -0.11246481 -0.11228665 -0.11348916\n", + " -0.11250694 -0.11250274 -0.11298019 -0.11277323 -0.11248601 -0.11301753\n", + " -0.11259486 -0.1124685 -0.11285441 -0.11274424 -0.11232891 -0.11316456\n", + " -0.11274575 -0.11256149 -0.11252293 -0.11293524 -0.11261757 -0.11305628\n", + " -0.11253063 -0.11237109 -0.11278518 -0.1124074 -0.11276905 -0.11296684\n", + " -0.11258689 -0.11228467 -0.11331342 nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " -0.11292265 -0.11395193 -0.11564599 -0.11244356 -0.11338947 -0.1148266\n", + " -0.11295702 -0.11353862 -0.11510521 -0.11244347 -0.11387967 -0.11512396\n", + " -0.11269802 -0.11364442 -0.1151339 -0.11238356 -0.11364301 -0.11496543\n", + " -0.11229193 -0.11340926 -0.11550744 -0.11215818 -0.11367944 -0.11552889\n", + " -0.11240305 -0.11352309 -0.115412 -0.1128402 -0.11338749 -0.1153551\n", + " -0.11250042 -0.11347275 -0.11548445 -0.11271132 -0.11377527 -0.11558066\n", + " -0.11318598 -0.11325792 -0.11499103 -0.11253099 -0.1129829 -0.11530949\n", + " -0.11239074 -0.11329625 -0.11544761 -0.11262484 -0.11323392 -0.1151936\n", + " -0.11253889 -0.11382403 -0.11511129 -0.11250854 -0.11339898 -0.11536332\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan -0.11542253 -0.11498664 -0.11428517\n", + " -0.11503783 -0.11473447 -0.11458687 -0.11483866 -0.1154254 -0.11479037\n", + " -0.11533015 -0.11515195 -0.11460571 -0.11563491 -0.11433835 -0.11437413\n", + " -0.11510849 -0.11472156 -0.11516494 -0.11545009 -0.115001 -0.11479743\n", + " 
-0.11461761 -0.11537461 -0.11497109 -0.1155148 -0.11567353 -0.11431184\n", + " -0.11546067 -0.11462564 -0.11450721 -0.11511 -0.11487988 -0.11466523\n", + " -0.11585756 -0.11462611 -0.11433121 -0.11538152 -0.11463425 -0.11527088\n", + " -0.11509145 -0.11493588 -0.11484324 -0.11528905 -0.11426327 -0.11476508\n", + " -0.11499562 -0.11451299 -0.11466765 -0.11525918 -0.11469718 -0.11476983\n", + " -0.11467865 -0.1145067 -0.11479425 nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " -0.11352917 -0.1145882 -0.11643688 -0.11418115 -0.11442858 -0.11635549\n", + " -0.11408502 -0.11458383 -0.1163013 -0.1135842 -0.11453566 -0.11575264\n", + " -0.11341863 -0.11481638 -0.11635685 -0.1132144 -0.11438018 -0.11666005\n", + " -0.11311482 -0.11500883 -0.11594984 -0.11409228 -0.11464061 -0.1158012\n", + " -0.11389399 -0.11454081 -0.1157428 -0.11333869 -0.11438896 -0.11676006\n", + " -0.11382523 -0.11443669 -0.11606569 -0.11424726 -0.11464652 -0.11608159\n", + " -0.11396605 -0.11473188 -0.1167532 -0.1136805 -0.11455875 -0.11615814\n", + " -0.11372286 -0.11442829 -0.11590895 -0.1136509 -0.11368863 -0.11660073\n", + " -0.1136605 -0.1141187 -0.11613806 -0.11326355 -0.11427399 -0.11676148\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan -0.11573534 -0.11897501 -0.1226239\n", + " -0.1162633 -0.11939573 -0.12255715 -0.11636411 -0.11878021 -0.12306277\n", + " -0.11535113 -0.11813967 -0.1230085 -0.11594119 -0.11812955 -0.12217928\n", + " -0.11523023 -0.11843291 -0.12228252 -0.1159457 -0.11840108 -0.12181337\n", + " -0.11600134 -0.11790484 -0.12203724 -0.11579998 -0.11787918 -0.12317219\n", + " -0.11578704 -0.11837798 -0.12379234 -0.1155279 -0.11865384 -0.12319867\n", + " -0.11597008 -0.11886814 -0.12291788 -0.1162282 -0.11918752 -0.12363613\n", + " -0.11571473 -0.11805225 -0.12250506 -0.11640247 
-0.11823175 -0.1226976\n", + " -0.11571549 -0.11813327 -0.12229009 -0.11621545 -0.11793769 -0.1229533\n", + " -0.11528287 -0.1183919 -0.12121653]\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Лучшие гиперпараметры для Gradient Boosting:\n", + "{'learning_rate': 0.1, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}\n" + ] + } + ], + "source": [ + "\n", + "from sklearn.ensemble import GradientBoostingRegressor\n", + "\n", + "param_grid_gb = {\n", + " 'n_estimators': [50, 100, 200],\n", + " 'learning_rate': [0.01, 0.1, 0.2],\n", + " 'max_depth': [3, 5, 7],\n", + " 'min_samples_split': [2, 5, 10],\n", + " 'min_samples_leaf': [1, 2, 4],\n", + " 'max_features': ['auto', 'sqrt', 'log2']\n", + "}\n", + "\n", + "grid_search_gb = GridSearchCV(\n", + " estimator=GradientBoostingRegressor(),\n", + " param_grid=param_grid_gb,\n", + " cv=5,\n", + " scoring='neg_mean_squared_error',\n", + " n_jobs=-1\n", + ")\n", + "\n", + "grid_search_gb.fit(x_train, y_train)\n", + "\n", + "# Вывод лучших гиперпараметров\n", + "print(\"Лучшие гиперпараметры для Gradient Boosting:\")\n", + "print(grid_search_gb.best_params_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3) Метод k-ближайших соседей" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Лучшие гиперпараметры для k-Nearest Neighbors:\n", + "{'algorithm': 'ball_tree', 'n_neighbors': 10, 'p': 1, 'weights': 'distance'}\n" + ] + } + ], + "source": [ + "from sklearn.neighbors import KNeighborsRegressor\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "param_grid_knn = {\n", + " 'n_neighbors': [3, 5, 7, 10],\n", + " 'weights': ['uniform', 'distance'],\n", + " 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],\n", + " 'p': [1, 2]\n", + "}\n", + "\n", + 
"grid_search_knn = GridSearchCV(\n", + " estimator=KNeighborsRegressor(),\n", + " param_grid=param_grid_knn,\n", + " cv=5,\n", + " scoring='neg_mean_squared_error',\n", + " n_jobs=-1\n", + ")\n", + "\n", + "grid_search_knn.fit(x_train, y_train)\n", + "\n", + "# Вывод лучших гиперпараметров\n", + "print(\"Лучшие гиперпараметры для k-Nearest Neighbors:\")\n", + "print(grid_search_knn.best_params_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Предсказание на тестовой выборке" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred = model.predict(x_test)\n", + "y_pred_forest = model_forest.predict(x_test)\n", + "y_pred_lasso = model_lasso.predict(x_test)\n", + "y_pred_gb = model_gb.predict(x_test)\n", + "y_pred_neighbors = model_knn.predict(x_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Оценка качества модели" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1.\tMSE (Mean Squared Error)\n", + "Среднее значение квадратов разностей между предсказанными и фактическими значениями. Чем меньше значение, тем лучше модель." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean Squared Error (MSE):\n", + "k-NN: \t\t\t0.213\n", + "Random Forest: \t\t0.118\n", + "Lasso: \t\t\t0.166\n", + "Gradient Boosting: \t0.113\n", + "k-Nearest Neighbors: \t0.326\n" + ] + } + ], + "source": [ + "from sklearn.metrics import mean_squared_error\n", + "import numpy as np\n", + "\n", + "mse1 = mean_squared_error(y_test, y_pred)\n", + "mse2 = mean_squared_error(y_test, y_pred_forest)\n", + "mse3 = mean_squared_error(y_test, y_pred_lasso)\n", + "mse4 = mean_squared_error(y_test, y_pred_gb)\n", + "mse5 = mean_squared_error(y_test, y_pred_neighbors)\n", + "\n", + "mse1_rounded = round(mse1, 3)\n", + "mse2_rounded = round(mse2, 3)\n", + "mse3_rounded = round(mse3, 3)\n", + "mse4_rounded = round(mse4, 3)\n", + "mse5_rounded = round(mse5, 3)\n", + "\n", + "print(\"Mean Squared Error (MSE):\")\n", + "print(f\"k-NN: \\t\\t\\t{mse1_rounded}\")\n", + "print(f\"Random Forest: \\t\\t{mse2_rounded}\")\n", + "print(f\"Lasso: \\t\\t\\t{mse3_rounded}\")\n", + "print(f\"Gradient Boosting: \\t{mse4_rounded}\")\n", + "print(f\"k-Nearest Neighbors: \\t{mse5_rounded}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2.\tMAE\n", + "Среднее значение абсолютных разностей между предсказанными и фактическими значениями. Чем меньше значение, тем лучше модель." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean Absolute Error (MAE):\n", + "k-NN: \t\t\t0.213\n", + "Random Forest: \t\t0.238\n", + "Lasso: \t\t\t0.366\n", + "Gradient Boosting: \t0.246\n", + "k-Nearest Neighbors: \t0.485\n" + ] + } + ], + "source": [ + "from sklearn.metrics import mean_absolute_error\n", + "\n", + "mae1 = round(mean_absolute_error(y_test, y_pred),3)\n", + "mae2 = round(mean_absolute_error(y_test, y_pred_forest),3)\n", + "mae3 = round(mean_absolute_error(y_test, y_pred_lasso),3)\n", + "mae4 = round(mean_absolute_error(y_test, y_pred_gb),3)\n", + "mae5 = round(mean_absolute_error(y_test, y_pred_neighbors),3)\n", + "print(\"Mean Absolute Error (MAE):\")\n", + "print(f\"k-NN: \\t\\t\\t{mae1}\")\n", + "print(f\"Random Forest: \\t\\t{mae2}\")\n", + "print(f\"Lasso: \\t\\t\\t{mae3}\")\n", + "print(f\"Gradient Boosting: \\t{mae4}\")\n", + "print(f\"k-Nearest Neighbors: \\t{mae5}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3.\tR-squared\n", + "Мера, показывающая, насколько хорошо модель объясняет изменчивость данных. Значение находится в диапазоне от 0 до 1, где 1 — идеальное соответствие, а 0 — модель не объясняет данные." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 153, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "R² (R-squared): 0.127933821917115\n", + "\n", + "R² (R-squared):\n", + "k-NN: \t\t\t0.128\n", + "Random Forest: \t\t0.515\n", + "Lasso: \t\t\t0.319\n", + "Gradient Boosting: \t0.537\n", + "k-Nearest Neighbors: \t-0.337\n" + ] + } + ], + "source": [ + "from sklearn.metrics import r2_score\n", + "r2 = r2_score(y_test, y_pred)\n", + "print(f\"R² (R-squared): {r2}\")\n", + "\n", + "r2_1 = r2_score(y_test, y_pred)\n", + "r2_2 = r2_score(y_test, y_pred_forest)\n", + "r2_3 = r2_score(y_test, y_pred_lasso)\n", + "r2_4 = r2_score(y_test, y_pred_gb)\n", + "r2_5 = r2_score(y_test, y_pred_neighbors)\n", + "\n", + "r2_1_rounded = round(r2_1, 3)\n", + "r2_2_rounded = round(r2_2, 3)\n", + "r2_3_rounded = round(r2_3, 3)\n", + "r2_4_rounded = round(r2_4, 3)\n", + "r2_5_rounded = round(r2_5, 3)\n", + "\n", + "print(\"\\nR² (R-squared):\")\n", + "print(f\"k-NN: \\t\\t\\t{r2_1_rounded}\")\n", + "print(f\"Random Forest: \\t\\t{r2_2_rounded}\")\n", + "print(f\"Lasso: \\t\\t\\t{r2_3_rounded}\")\n", + "print(f\"Gradient Boosting: \\t{r2_4_rounded}\")\n", + "print(f\"k-Nearest Neighbors: \\t{r2_5_rounded}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "4.\tRMSE\n", + " Среднее отклонение предсказаний от реальных данных. Чем меньше модуль, тем лучше модель." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 151, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Root Mean Squared Error (RMSE):\n", + "k-NN: \t\t\t0.461\n", + "Random Forest: \t\t0.344\n", + "Lasso: \t\t\t0.407\n", + "Gradient Boosting: \t0.336\n", + "k-Nearest Neighbors: \t0.571\n" + ] + } + ], + "source": [ + "rmse1 = np.sqrt(mse1)\n", + "rmse2 = np.sqrt(mse2)\n", + "rmse3 = np.sqrt(mse3)\n", + "rmse4 = np.sqrt(mse4)\n", + "rmse5 = np.sqrt(mse5)\n", + "\n", + "rmse1_rounded = round(rmse1, 3)\n", + "rmse2_rounded = round(rmse2, 3)\n", + "rmse3_rounded = round(rmse3, 3)\n", + "rmse4_rounded = round(rmse4, 3)\n", + "rmse5_rounded = round(rmse5, 3)\n", + "\n", + "print(\"Root Mean Squared Error (RMSE):\")\n", + "print(f\"k-NN: \\t\\t\\t{rmse1_rounded}\")\n", + "print(f\"Random Forest: \\t\\t{rmse2_rounded}\")\n", + "print(f\"Lasso: \\t\\t\\t{rmse3_rounded}\")\n", + "print(f\"Gradient Boosting: \\t{rmse4_rounded}\")\n", + "print(f\"k-Nearest Neighbors: \\t{rmse5_rounded}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Лучший результат – градиентный бустинг и случайный лес.\n", + "Положительные результаты по всем критериям получил случайный лес. Три из четырех положительных результата у градиентного бустинга. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Значит, случайный лес – наиболее точная и устойчивая стратегия обучения модели. Итоговая модель – model_forest." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Также, с помощью применение важности признаков (feature importance) на Случайном лесе, мы вывели основные факторы, вызывающие депрессию:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Feature Importance\n", + "13 Have you ever had suicidal thoughts ? 
0.300542\n", + "5 Academic Pressure 0.134276\n", + "0 id 0.087970\n", + "7 CGPA 0.079078\n", + "2 Age 0.066613\n", + "15 Financial Stress 0.066330\n", + "3 City 0.059293\n", + "14 Work/Study Hours 0.052275\n", + "12 Degree 0.049539\n", + "8 Study Satisfaction 0.032944\n", + "11 Dietary Habits 0.026140\n", + "10 Sleep Duration 0.024435\n", + "16 Family History of Mental Illness 0.010547\n", + "1 Gender 0.009627\n", + "4 Profession 0.000372\n", + "9 Job Satisfaction 0.000017\n", + "6 Work Pressure 0.000003\n" + ] + } + ], + "source": [ + "from sklearn.ensemble import RandomForestRegressor\n", + "\n", + "model_rf = RandomForestRegressor(n_estimators=100, random_state=42)\n", + "model_rf.fit(x_train, y_train)\n", + "\n", + "feature_importances = model_rf.feature_importances_\n", + "\n", + "import pandas as pd\n", + "feature_importance_df = pd.DataFrame({\n", + " 'Feature': x.columns,\n", + " 'Importance': feature_importances\n", + "}).sort_values(by='Importance', ascending=False)\n", + "\n", + "print(feature_importance_df)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Scripts", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 9108fe27f63d3c3c82d983563e1a99784893ab05 Mon Sep 17 00:00:00 2001 From: dex_moth Date: Sat, 21 Dec 2024 00:17:24 +0400 Subject: [PATCH 2/7] =?UTF-8?q?=D0=A3=D0=B4=D0=B0=D0=BB=D0=B8=D1=82=D1=8C?= =?UTF-8?q?=20lab=5F4/Lab4.ipynb?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lab_4/Lab4.ipynb | 911 ----------------------------------------------- 1 file changed, 911 deletions(-) delete mode 100644 lab_4/Lab4.ipynb diff --git a/lab_4/Lab4.ipynb b/lab_4/Lab4.ipynb deleted file mode 
100644 index 0b8116e..0000000 --- a/lab_4/Lab4.ipynb +++ /dev/null @@ -1,911 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'Gender', 'Age', 'City', 'Profession', 'Academic Pressure',\n", - " 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction',\n", - " 'Sleep Duration', 'Dietary Habits', 'Degree',\n", - " 'Have you ever had suicidal thoughts ?', 'Work/Study Hours',\n", - " 'Financial Stress', 'Family History of Mental Illness', 'Depression'],\n", - " dtype='object')\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "from matplotlib.ticker import FuncFormatter\n", - "\n", - "df = pd.read_csv(\".//csv//Student Depression Dataset.csv\")\n", - "print(df.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " id Gender Age City Profession Academic Pressure \\\n", - "0 2 Male 33.0 Visakhapatnam Student 5.0 \n", - "1 8 Female 24.0 Bangalore Student 2.0 \n", - "2 26 Male 31.0 Srinagar Student 3.0 \n", - "3 30 Female 28.0 Varanasi Student 3.0 \n", - "4 32 Female 25.0 Jaipur Student 4.0 \n", - "\n", - " Work Pressure CGPA Study Satisfaction Job Satisfaction \\\n", - "0 0.0 8.97 2.0 0.0 \n", - "1 0.0 5.90 5.0 0.0 \n", - "2 0.0 7.03 5.0 0.0 \n", - "3 0.0 5.59 2.0 0.0 \n", - "4 0.0 8.13 3.0 0.0 \n", - "\n", - " Sleep Duration Dietary Habits Degree \\\n", - "0 5-6 hours Healthy B.Pharm \n", - "1 5-6 hours Moderate BSc \n", - "2 Less than 5 hours Healthy BA \n", - "3 7-8 hours Moderate BCA \n", - "4 5-6 hours Moderate M.Tech \n", - "\n", - " Have you ever had suicidal thoughts ? 
Work/Study Hours Financial Stress \\\n", - "0 Yes 3.0 1.0 \n", - "1 No 3.0 2.0 \n", - "2 No 9.0 1.0 \n", - "3 Yes 4.0 5.0 \n", - "4 Yes 1.0 1.0 \n", - "\n", - " Family History of Mental Illness Depression \n", - "0 No 1 \n", - "1 Yes 0 \n", - "2 Yes 0 \n", - "3 Yes 1 \n", - "4 No 0 \n" - ] - } - ], - "source": [ - "print(df.head())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Бизнес-цель исследования\n", - "Разработать и внедрить систему прогнозирования уровня депрессии среди обучающихся, которая позволит выявить группы риска на ранних этапах. Результаты исследования могут быть полезны психологам, педагогам и администрации учебных заведений.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Описание набора данных для анализа\n", - "Набор данных содержит информацию о психологическом состоянии обучающихся и включает следующие поля:\n", - "- id – идентификатор, число\n", - "- Gender – пол, строка\n", - "- Age – возраст, дробное число\n", - "- City – город, строка\n", - "- Profession – профессия, строка\n", - "- Academic Pressure – академическое давление, дробное число (от 1.00 до 5.00)\n", - "- Work Pressure – рабочее давление, дробное число (от 1.00 до 5.00)\n", - "- CGPA – средний балл (GPA), дробное число\n", - "- Study Satisfaction – удовлетворенность учебой, дробное число (от 1.00 до 5.00)\n", - "- Job Satisfaction – удовлетворенность работой, дробное число (от 1.00 до 5.00)\n", - "- Sleep Duration – продолжительность сна, строка\n", - "- Dietary Habits – пищевые привычки, строка\n", - "- Degree – степень (образование), строка\n", - "- Have you ever had suicidal thoughts? – Были ли у вас когда-либо суицидальные мысли? 
строка (yes/no)\n", - "- Work/Study Hours – часы работы/учебы, дробное число\n", - "- Financial Stress – финансовый стресс, дробное число (от 1.00 до 5.00)\n", - "- Family History of Mental Illness – семейный анамнез психических заболеваний, строка (yes/no)\n", - "- Depression – депрессия, булевое значение (1/0)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Обработка данных" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "id 0\n", - "Gender 0\n", - "Age 0\n", - "City 0\n", - "Profession 0\n", - "Academic Pressure 0\n", - "Work Pressure 0\n", - "CGPA 0\n", - "Study Satisfaction 0\n", - "Job Satisfaction 0\n", - "Sleep Duration 0\n", - "Dietary Habits 0\n", - "Degree 0\n", - "Have you ever had suicidal thoughts ? 0\n", - "Work/Study Hours 0\n", - "Financial Stress 3\n", - "Family History of Mental Illness 0\n", - "Depression 0\n", - "dtype: int64" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.isnull().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "df.dropna(subset=['Financial Stress'], inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "features = ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', \n", - " 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress', 'Depression']\n", - "\n", - "plt.figure(figsize=(15, 10))\n", - "for i, feature in enumerate(features, 1):\n", - " plt.subplot(3, 3, i)\n", - " sns.boxplot(y=df[feature], color='skyblue')\n", - " plt.title(f'Boxplot of {feature}')\n", - " plt.ylabel(feature)\n", - "\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "В Age много выбросов. Сбалансируем данные" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "Q1 = df['Age'].quantile(0.25)\n", - "Q3 = df['Age'].quantile(0.75)\n", - "IQR = Q3 - Q1\n", - "\n", - "threshold = 1.5 * IQR\n", - "outliers = (df['Age'] < (Q1 - threshold)) | (df['Age'] > (Q3 + threshold))\n", - "\n", - "median_rating = df['Age'].median()\n", - "df.loc[outliers, 'Age'] = median_rating\n", - "\n", - "plt.figure(figsize=(8, 6))\n", - "sns.boxplot(y=df['Age'], color='skyblue')\n", - "plt.title('Boxplot of Age')\n", - "plt.ylabel('Age')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Конструирование признаков с помощью меток" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.preprocessing import LabelEncoder\n", - "\n", - "le = LabelEncoder()\n", - "df['Gender'] = le.fit_transform(df['Gender'])\n", - "df['City'] = le.fit_transform(df['City'])\n", - "df['Dietary Habits'] = le.fit_transform(df['Dietary Habits'])\n", - "df['Degree'] = le.fit_transform(df['Degree'])\n", - "df['Have you ever had suicidal thoughts ?'] = le.fit_transform(df['Have you ever had suicidal thoughts ?'])\n", - "df['Sleep Duration'] = le.fit_transform(df['Sleep Duration'])\n", - "df['Profession'] = le.fit_transform(df['Profession'])\n", - "df['Study Satisfaction'] = le.fit_transform(df['Study Satisfaction'])\n", - "df['Family History of Mental Illness'] = le.fit_transform(df['Family History of Mental Illness'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "разделение на признаки и целевую переменную" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "x = df.drop('Depression', axis=1)\n", - "y = df['Depression']" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - 
"\n", - "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1) Метод регрессии Лассо\n" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Лучшие гиперпараметры для Lasso:\n", - "{'alpha': 0.01, 'fit_intercept': False}\n" - ] - } - ], - "source": [ - "from sklearn.linear_model import Lasso\n", - "\n", - "param_grid_lasso = {\n", - " 'alpha': [0.01, 0.1, 1.0, 10.0],\n", - " 'fit_intercept': [True, False],\n", - "}\n", - "\n", - "# Создание объекта GridSearchCV\n", - "grid_search_lasso = GridSearchCV(\n", - " estimator=Lasso(), \n", - " param_grid=param_grid_lasso, \n", - " cv=5, \n", - " scoring='neg_mean_squared_error', \n", - " n_jobs=-1 \n", - ")\n", - "\n", - "grid_search_lasso.fit(x_train, y_train)\n", - "\n", - "# Вывод лучших гиперпараметров\n", - "print(\"Лучшие гиперпараметры для Lasso:\")\n", - "print(grid_search_lasso.best_params_)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2) Метод градиентного бустинга" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py:540: FitFailedWarning: \n", - "1215 fits failed out of a total of 3645.\n", - "The score on these train-test partitions for these parameters will be set to nan.\n", - "If these failures are not expected, you can try to debug them by setting error_score='raise'.\n", - "\n", - "Below are more details about the failures:\n", - "--------------------------------------------------------------------------------\n", - "978 fits failed with the following error:\n", - "Traceback (most recent call last):\n", - " File 
\"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 888, in _fit_and_score\n", - " estimator.fit(X_train, y_train, **fit_params)\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 1466, in wrapper\n", - " estimator._validate_params()\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 666, in _validate_params\n", - " validate_parameter_constraints(\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py\", line 95, in validate_parameter_constraints\n", - " raise InvalidParameterError(\n", - "sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of GradientBoostingRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto' instead.\n", - "\n", - "--------------------------------------------------------------------------------\n", - "237 fits failed with the following error:\n", - "Traceback (most recent call last):\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 888, in _fit_and_score\n", - " estimator.fit(X_train, y_train, **fit_params)\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 1466, in wrapper\n", - " estimator._validate_params()\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 666, in _validate_params\n", - " validate_parameter_constraints(\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py\", line 95, in validate_parameter_constraints\n", - " raise InvalidParameterError(\n", - "sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of GradientBoostingRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. 
Got 'auto' instead.\n", - "\n", - " warnings.warn(some_fits_failed_message, FitFailedWarning)\n", - "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\numpy\\ma\\core.py:2881: RuntimeWarning: invalid value encountered in cast\n", - " _data = np.array(data, dtype=dtype, copy=copy,\n", - "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:1103: UserWarning: One or more of the test scores are non-finite: [ nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan -0.18767441 -0.15799837 -0.13080278\n", - " -0.18762913 -0.15792709 -0.13056114 -0.18792038 -0.15737146 -0.130218\n", - " -0.18725961 -0.157967 -0.13047453 -0.18766583 -0.15779565 -0.13094863\n", - " -0.18798705 -0.15693978 -0.13061215 -0.18766317 -0.15746848 -0.13072918\n", - " -0.18864158 -0.15666133 -0.13095037 -0.18817206 -0.15805489 -0.13086126\n", - " -0.18707465 -0.15864932 -0.13104947 -0.18818902 -0.15828572 -0.13063871\n", - " -0.18701628 -0.15853864 -0.13019458 -0.18740927 -0.15836397 -0.13065455\n", - " -0.18768748 -0.15828297 -0.1309458 -0.18845004 -0.15696395 -0.13023062\n", - " -0.18754854 -0.15899615 -0.13061707 -0.18831427 -0.15819939 -0.13096524\n", - " -0.18662963 -0.15815869 -0.13089186 nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " -0.1758914 -0.1442684 -0.12093344 -0.1758927 -0.14423731 -0.12084543\n", - " -0.17573339 -0.14419842 -0.12076166 -0.17512045 -0.14435454 -0.1207299\n", - " -0.17669645 -0.14397965 -0.12087019 -0.17605424 -0.1438664 -0.12091068\n", - " -0.17582192 -0.1443651 -0.12097165 -0.17588422 -0.14421003 -0.12081764\n", - " -0.17522742 -0.14424357 -0.12086484 -0.17530986 -0.14433713 -0.12091757\n", - " -0.17565647 -0.14408902 -0.12075918 -0.17561884 -0.14426355 -0.12094066\n", - " -0.17522371 -0.1439869 -0.12099023 -0.17619772 -0.14396131 -0.12079667\n", - " -0.17710789 
-0.1448419 -0.12087822 -0.17608534 -0.14416684 -0.12087865\n", - " -0.1754675 -0.1442258 -0.12068226 -0.17611334 -0.14433552 -0.12093556\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan -0.16938321 -0.13763002 -0.11703902\n", - " -0.16953091 -0.13736586 -0.11695779 -0.16881837 -0.1375676 -0.11694438\n", - " -0.16927898 -0.13748177 -0.11689982 -0.16921265 -0.13757375 -0.11682524\n", - " -0.16915872 -0.13727377 -0.11694336 -0.16939766 -0.13734972 -0.1167447\n", - " -0.16924214 -0.1373768 -0.11674816 -0.16918278 -0.13746085 -0.1169816\n", - " -0.16927003 -0.13740063 -0.1169564 -0.16916501 -0.13752074 -0.11687641\n", - " -0.16928973 -0.13751536 -0.11697948 -0.16934836 -0.13727436 -0.11693615\n", - " -0.16912453 -0.13748699 -0.11693425 -0.1692788 -0.13750784 -0.11694655\n", - " -0.16919354 -0.13747437 -0.11708782 -0.16940009 -0.13757749 -0.11700586\n", - " -0.1692801 -0.13725384 -0.11684394 nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " -0.11606052 -0.1140225 -0.11403709 -0.11627212 -0.1139982 -0.11402075\n", - " -0.11613561 -0.11407941 -0.11420487 -0.11666225 -0.11462523 -0.11431901\n", - " -0.11604817 -0.11456211 -0.11392092 -0.11609343 -0.11394228 -0.11414071\n", - " -0.11611685 -0.11420178 -0.11405459 -0.11594404 -0.11408614 -0.11391662\n", - " -0.11590886 -0.11396465 -0.11389125 -0.11616694 -0.11441846 -0.11417015\n", - " -0.11617368 -0.11429765 -0.1139636 -0.11616763 -0.11433984 -0.11412121\n", - " -0.11625618 -0.11402999 -0.11419791 -0.11613603 -0.114206 -0.11423922\n", - " -0.1160801 -0.11431896 -0.11416734 -0.11608923 -0.11455498 -0.11417448\n", - " -0.11605165 -0.11427773 -0.11392205 -0.11606243 -0.11408421 -0.11395292\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan 
-0.11281447 -0.11245904 -0.11308822\n", - " -0.11256366 -0.11230094 -0.1130767 -0.11282651 -0.1121034 -0.11283479\n", - " -0.11260704 -0.1125136 -0.11288977 -0.11278304 -0.11242278 -0.11268564\n", - " -0.11263359 -0.11236227 -0.11329411 -0.11231603 -0.1124533 -0.11278826\n", - " -0.11291545 -0.11241223 -0.11250702 -0.11246481 -0.11228665 -0.11348916\n", - " -0.11250694 -0.11250274 -0.11298019 -0.11277323 -0.11248601 -0.11301753\n", - " -0.11259486 -0.1124685 -0.11285441 -0.11274424 -0.11232891 -0.11316456\n", - " -0.11274575 -0.11256149 -0.11252293 -0.11293524 -0.11261757 -0.11305628\n", - " -0.11253063 -0.11237109 -0.11278518 -0.1124074 -0.11276905 -0.11296684\n", - " -0.11258689 -0.11228467 -0.11331342 nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " -0.11292265 -0.11395193 -0.11564599 -0.11244356 -0.11338947 -0.1148266\n", - " -0.11295702 -0.11353862 -0.11510521 -0.11244347 -0.11387967 -0.11512396\n", - " -0.11269802 -0.11364442 -0.1151339 -0.11238356 -0.11364301 -0.11496543\n", - " -0.11229193 -0.11340926 -0.11550744 -0.11215818 -0.11367944 -0.11552889\n", - " -0.11240305 -0.11352309 -0.115412 -0.1128402 -0.11338749 -0.1153551\n", - " -0.11250042 -0.11347275 -0.11548445 -0.11271132 -0.11377527 -0.11558066\n", - " -0.11318598 -0.11325792 -0.11499103 -0.11253099 -0.1129829 -0.11530949\n", - " -0.11239074 -0.11329625 -0.11544761 -0.11262484 -0.11323392 -0.1151936\n", - " -0.11253889 -0.11382403 -0.11511129 -0.11250854 -0.11339898 -0.11536332\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan -0.11542253 -0.11498664 -0.11428517\n", - " -0.11503783 -0.11473447 -0.11458687 -0.11483866 -0.1154254 -0.11479037\n", - " -0.11533015 -0.11515195 -0.11460571 -0.11563491 -0.11433835 -0.11437413\n", - " -0.11510849 -0.11472156 -0.11516494 -0.11545009 -0.115001 -0.11479743\n", - " 
-0.11461761 -0.11537461 -0.11497109 -0.1155148 -0.11567353 -0.11431184\n", - " -0.11546067 -0.11462564 -0.11450721 -0.11511 -0.11487988 -0.11466523\n", - " -0.11585756 -0.11462611 -0.11433121 -0.11538152 -0.11463425 -0.11527088\n", - " -0.11509145 -0.11493588 -0.11484324 -0.11528905 -0.11426327 -0.11476508\n", - " -0.11499562 -0.11451299 -0.11466765 -0.11525918 -0.11469718 -0.11476983\n", - " -0.11467865 -0.1145067 -0.11479425 nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " -0.11352917 -0.1145882 -0.11643688 -0.11418115 -0.11442858 -0.11635549\n", - " -0.11408502 -0.11458383 -0.1163013 -0.1135842 -0.11453566 -0.11575264\n", - " -0.11341863 -0.11481638 -0.11635685 -0.1132144 -0.11438018 -0.11666005\n", - " -0.11311482 -0.11500883 -0.11594984 -0.11409228 -0.11464061 -0.1158012\n", - " -0.11389399 -0.11454081 -0.1157428 -0.11333869 -0.11438896 -0.11676006\n", - " -0.11382523 -0.11443669 -0.11606569 -0.11424726 -0.11464652 -0.11608159\n", - " -0.11396605 -0.11473188 -0.1167532 -0.1136805 -0.11455875 -0.11615814\n", - " -0.11372286 -0.11442829 -0.11590895 -0.1136509 -0.11368863 -0.11660073\n", - " -0.1136605 -0.1141187 -0.11613806 -0.11326355 -0.11427399 -0.11676148\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan -0.11573534 -0.11897501 -0.1226239\n", - " -0.1162633 -0.11939573 -0.12255715 -0.11636411 -0.11878021 -0.12306277\n", - " -0.11535113 -0.11813967 -0.1230085 -0.11594119 -0.11812955 -0.12217928\n", - " -0.11523023 -0.11843291 -0.12228252 -0.1159457 -0.11840108 -0.12181337\n", - " -0.11600134 -0.11790484 -0.12203724 -0.11579998 -0.11787918 -0.12317219\n", - " -0.11578704 -0.11837798 -0.12379234 -0.1155279 -0.11865384 -0.12319867\n", - " -0.11597008 -0.11886814 -0.12291788 -0.1162282 -0.11918752 -0.12363613\n", - " -0.11571473 -0.11805225 -0.12250506 -0.11640247 
-0.11823175 -0.1226976\n", - " -0.11571549 -0.11813327 -0.12229009 -0.11621545 -0.11793769 -0.1229533\n", - " -0.11528287 -0.1183919 -0.12121653]\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Лучшие гиперпараметры для Gradient Boosting:\n", - "{'learning_rate': 0.1, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}\n" - ] - } - ], - "source": [ - "\n", - "from sklearn.ensemble import GradientBoostingRegressor\n", - "\n", - "param_grid_gb = {\n", - " 'n_estimators': [50, 100, 200],\n", - " 'learning_rate': [0.01, 0.1, 0.2],\n", - " 'max_depth': [3, 5, 7],\n", - " 'min_samples_split': [2, 5, 10],\n", - " 'min_samples_leaf': [1, 2, 4],\n", - " 'max_features': ['auto', 'sqrt', 'log2']\n", - "}\n", - "\n", - "grid_search_gb = GridSearchCV(\n", - " estimator=GradientBoostingRegressor(),\n", - " param_grid=param_grid_gb,\n", - " cv=5,\n", - " scoring='neg_mean_squared_error',\n", - " n_jobs=-1\n", - ")\n", - "\n", - "grid_search_gb.fit(x_train, y_train)\n", - "\n", - "# Вывод лучших гиперпараметров\n", - "print(\"Лучшие гиперпараметры для Gradient Boosting:\")\n", - "print(grid_search_gb.best_params_)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 3) Метод k-ближайших соседей" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Лучшие гиперпараметры для k-Nearest Neighbors:\n", - "{'algorithm': 'ball_tree', 'n_neighbors': 10, 'p': 1, 'weights': 'distance'}\n" - ] - } - ], - "source": [ - "from sklearn.neighbors import KNeighborsRegressor\n", - "from sklearn.model_selection import GridSearchCV\n", - "\n", - "param_grid_knn = {\n", - " 'n_neighbors': [3, 5, 7, 10],\n", - " 'weights': ['uniform', 'distance'],\n", - " 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],\n", - " 'p': [1, 2]\n", - "}\n", - "\n", - 
"grid_search_knn = GridSearchCV(\n", - " estimator=KNeighborsRegressor(),\n", - " param_grid=param_grid_knn,\n", - " cv=5,\n", - " scoring='neg_mean_squared_error',\n", - " n_jobs=-1\n", - ")\n", - "\n", - "grid_search_knn.fit(x_train, y_train)\n", - "\n", - "# Вывод лучших гиперпараметров\n", - "print(\"Лучшие гиперпараметры для k-Nearest Neighbors:\")\n", - "print(grid_search_knn.best_params_)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Предсказание на тестовой выборке" - ] - }, - { - "cell_type": "code", - "execution_count": 128, - "metadata": {}, - "outputs": [], - "source": [ - "y_pred = model.predict(x_test)\n", - "y_pred_forest = model_forest.predict(x_test)\n", - "y_pred_lasso = model_lasso.predict(x_test)\n", - "y_pred_gb = model_gb.predict(x_test)\n", - "y_pred_neighbors = model_knn.predict(x_test)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Оценка качества модели" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "1.\tMSE (Mean Squared Error)\n", - "Среднее значение квадратов разностей между предсказанными и фактическими значениями. Чем меньше значение, тем лучше модель." 
- ] - }, - { - "cell_type": "code", - "execution_count": 156, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Mean Squared Error (MSE):\n", - "k-NN: \t\t\t0.213\n", - "Random Forest: \t\t0.118\n", - "Lasso: \t\t\t0.166\n", - "Gradient Boosting: \t0.113\n", - "k-Nearest Neighbors: \t0.326\n" - ] - } - ], - "source": [ - "from sklearn.metrics import mean_squared_error\n", - "import numpy as np\n", - "\n", - "mse1 = mean_squared_error(y_test, y_pred)\n", - "mse2 = mean_squared_error(y_test, y_pred_forest)\n", - "mse3 = mean_squared_error(y_test, y_pred_lasso)\n", - "mse4 = mean_squared_error(y_test, y_pred_gb)\n", - "mse5 = mean_squared_error(y_test, y_pred_neighbors)\n", - "\n", - "mse1_rounded = round(mse1, 3)\n", - "mse2_rounded = round(mse2, 3)\n", - "mse3_rounded = round(mse3, 3)\n", - "mse4_rounded = round(mse4, 3)\n", - "mse5_rounded = round(mse5, 3)\n", - "\n", - "print(\"Mean Squared Error (MSE):\")\n", - "print(f\"k-NN: \\t\\t\\t{mse1_rounded}\")\n", - "print(f\"Random Forest: \\t\\t{mse2_rounded}\")\n", - "print(f\"Lasso: \\t\\t\\t{mse3_rounded}\")\n", - "print(f\"Gradient Boosting: \\t{mse4_rounded}\")\n", - "print(f\"k-Nearest Neighbors: \\t{mse5_rounded}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "2.\tMAE\n", - "Среднее значение абсолютных разностей между предсказанными и фактическими значениями. Чем меньше значение, тем лучше модель." 
- ] - }, - { - "cell_type": "code", - "execution_count": 155, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Mean Absolute Error (MAE):\n", - "k-NN: \t\t\t0.213\n", - "Random Forest: \t\t0.238\n", - "Lasso: \t\t\t0.366\n", - "Gradient Boosting: \t0.246\n", - "k-Nearest Neighbors: \t0.485\n" - ] - } - ], - "source": [ - "from sklearn.metrics import mean_absolute_error\n", - "\n", - "mae1 = round(mean_absolute_error(y_test, y_pred),3)\n", - "mae2 = round(mean_absolute_error(y_test, y_pred_forest),3)\n", - "mae3 = round(mean_absolute_error(y_test, y_pred_lasso),3)\n", - "mae4 = round(mean_absolute_error(y_test, y_pred_gb),3)\n", - "mae5 = round(mean_absolute_error(y_test, y_pred_neighbors),3)\n", - "print(\"Mean Absolute Error (MAE):\")\n", - "print(f\"k-NN: \\t\\t\\t{mae1}\")\n", - "print(f\"Random Forest: \\t\\t{mae2}\")\n", - "print(f\"Lasso: \\t\\t\\t{mae3}\")\n", - "print(f\"Gradient Boosting: \\t{mae4}\")\n", - "print(f\"k-Nearest Neighbors: \\t{mae5}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "3.\tR-squared\n", - "Мера, показывающая, насколько хорошо модель объясняет изменчивость данных. Значение находится в диапазоне от 0 до 1, где 1 — идеальное соответствие, а 0 — модель не объясняет данные." 
- ] - }, - { - "cell_type": "code", - "execution_count": 153, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "R² (R-squared): 0.127933821917115\n", - "\n", - "R² (R-squared):\n", - "k-NN: \t\t\t0.128\n", - "Random Forest: \t\t0.515\n", - "Lasso: \t\t\t0.319\n", - "Gradient Boosting: \t0.537\n", - "k-Nearest Neighbors: \t-0.337\n" - ] - } - ], - "source": [ - "from sklearn.metrics import r2_score\n", - "r2 = r2_score(y_test, y_pred)\n", - "print(f\"R² (R-squared): {r2}\")\n", - "\n", - "r2_1 = r2_score(y_test, y_pred)\n", - "r2_2 = r2_score(y_test, y_pred_forest)\n", - "r2_3 = r2_score(y_test, y_pred_lasso)\n", - "r2_4 = r2_score(y_test, y_pred_gb)\n", - "r2_5 = r2_score(y_test, y_pred_neighbors)\n", - "\n", - "r2_1_rounded = round(r2_1, 3)\n", - "r2_2_rounded = round(r2_2, 3)\n", - "r2_3_rounded = round(r2_3, 3)\n", - "r2_4_rounded = round(r2_4, 3)\n", - "r2_5_rounded = round(r2_5, 3)\n", - "\n", - "print(\"\\nR² (R-squared):\")\n", - "print(f\"k-NN: \\t\\t\\t{r2_1_rounded}\")\n", - "print(f\"Random Forest: \\t\\t{r2_2_rounded}\")\n", - "print(f\"Lasso: \\t\\t\\t{r2_3_rounded}\")\n", - "print(f\"Gradient Boosting: \\t{r2_4_rounded}\")\n", - "print(f\"k-Nearest Neighbors: \\t{r2_5_rounded}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "4.\tRMSE\n", - " Среднее отклонение предсказаний от реальных данных. Чем меньше модуль, тем лучше модель." 
- ] - }, - { - "cell_type": "code", - "execution_count": 151, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Root Mean Squared Error (RMSE):\n", - "k-NN: \t\t\t0.461\n", - "Random Forest: \t\t0.344\n", - "Lasso: \t\t\t0.407\n", - "Gradient Boosting: \t0.336\n", - "k-Nearest Neighbors: \t0.571\n" - ] - } - ], - "source": [ - "rmse1 = np.sqrt(mse1)\n", - "rmse2 = np.sqrt(mse2)\n", - "rmse3 = np.sqrt(mse3)\n", - "rmse4 = np.sqrt(mse4)\n", - "rmse5 = np.sqrt(mse5)\n", - "\n", - "rmse1_rounded = round(rmse1, 3)\n", - "rmse2_rounded = round(rmse2, 3)\n", - "rmse3_rounded = round(rmse3, 3)\n", - "rmse4_rounded = round(rmse4, 3)\n", - "rmse5_rounded = round(rmse5, 3)\n", - "\n", - "print(\"Root Mean Squared Error (RMSE):\")\n", - "print(f\"k-NN: \\t\\t\\t{rmse1_rounded}\")\n", - "print(f\"Random Forest: \\t\\t{rmse2_rounded}\")\n", - "print(f\"Lasso: \\t\\t\\t{rmse3_rounded}\")\n", - "print(f\"Gradient Boosting: \\t{rmse4_rounded}\")\n", - "print(f\"k-Nearest Neighbors: \\t{rmse5_rounded}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Лучший результат – градиентный бустинг и случайный лес.\n", - "Положительные результаты по всем критериям получил случайный лес. Три из четырех положительных результата у градиентного бустинга. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Значит, случайный лес – наиболее точная и устойчивая стратегия обучения модели. Итоговая модель – model_forest." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Также, с помощью применение важности признаков (feature importance) на Случайном лесе, мы вывели основные факторы, вызывающие депрессию:" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Feature Importance\n", - "13 Have you ever had suicidal thoughts ? 
0.300542\n", - "5 Academic Pressure 0.134276\n", - "0 id 0.087970\n", - "7 CGPA 0.079078\n", - "2 Age 0.066613\n", - "15 Financial Stress 0.066330\n", - "3 City 0.059293\n", - "14 Work/Study Hours 0.052275\n", - "12 Degree 0.049539\n", - "8 Study Satisfaction 0.032944\n", - "11 Dietary Habits 0.026140\n", - "10 Sleep Duration 0.024435\n", - "16 Family History of Mental Illness 0.010547\n", - "1 Gender 0.009627\n", - "4 Profession 0.000372\n", - "9 Job Satisfaction 0.000017\n", - "6 Work Pressure 0.000003\n" - ] - } - ], - "source": [ - "from sklearn.ensemble import RandomForestRegressor\n", - "\n", - "model_rf = RandomForestRegressor(n_estimators=100, random_state=42)\n", - "model_rf.fit(x_train, y_train)\n", - "\n", - "feature_importances = model_rf.feature_importances_\n", - "\n", - "import pandas as pd\n", - "feature_importance_df = pd.DataFrame({\n", - " 'Feature': x.columns,\n", - " 'Importance': feature_importances\n", - "}).sort_values(by='Importance', ascending=False)\n", - "\n", - "print(feature_importance_df)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Scripts", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 1529fc881f7b73130e848d1d7bbc78f570edb621 Mon Sep 17 00:00:00 2001 From: dex_moth Date: Sat, 21 Dec 2024 00:19:22 +0400 Subject: [PATCH 3/7] lab_4 --- lab_4/Lab4.ipynb | 911 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 911 insertions(+) create mode 100644 lab_4/Lab4.ipynb diff --git a/lab_4/Lab4.ipynb b/lab_4/Lab4.ipynb new file mode 100644 index 0000000..0b8116e --- /dev/null +++ b/lab_4/Lab4.ipynb @@ -0,0 +1,911 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['id', 'Gender', 'Age', 'City', 'Profession', 'Academic Pressure',\n", + " 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction',\n", + " 'Sleep Duration', 'Dietary Habits', 'Degree',\n", + " 'Have you ever had suicidal thoughts ?', 'Work/Study Hours',\n", + " 'Financial Stress', 'Family History of Mental Illness', 'Depression'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from matplotlib.ticker import FuncFormatter\n", + "\n", + "df = pd.read_csv(\".//csv//Student Depression Dataset.csv\")\n", + "print(df.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " id Gender Age City Profession Academic Pressure \\\n", + "0 2 Male 33.0 Visakhapatnam Student 5.0 \n", + "1 8 Female 24.0 Bangalore Student 2.0 \n", + "2 26 Male 31.0 Srinagar Student 3.0 \n", + "3 30 Female 28.0 Varanasi Student 3.0 \n", + "4 32 Female 25.0 Jaipur Student 4.0 \n", + "\n", + " Work Pressure CGPA Study Satisfaction Job Satisfaction \\\n", + "0 0.0 8.97 2.0 0.0 \n", + "1 0.0 5.90 5.0 0.0 \n", + "2 0.0 7.03 5.0 0.0 \n", + "3 0.0 5.59 2.0 0.0 \n", + "4 0.0 8.13 3.0 0.0 \n", + "\n", + " Sleep Duration Dietary Habits Degree \\\n", + "0 5-6 hours Healthy B.Pharm \n", + "1 5-6 hours Moderate BSc \n", + "2 Less than 5 hours Healthy BA \n", + "3 7-8 hours Moderate BCA \n", + "4 5-6 hours Moderate M.Tech \n", + "\n", + " Have you ever had suicidal thoughts ? 
Work/Study Hours Financial Stress \\\n", + "0 Yes 3.0 1.0 \n", + "1 No 3.0 2.0 \n", + "2 No 9.0 1.0 \n", + "3 Yes 4.0 5.0 \n", + "4 Yes 1.0 1.0 \n", + "\n", + " Family History of Mental Illness Depression \n", + "0 No 1 \n", + "1 Yes 0 \n", + "2 Yes 0 \n", + "3 Yes 1 \n", + "4 No 0 \n" + ] + } + ], + "source": [ + "print(df.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Бизнес-цель исследования\n", + "Разработать и внедрить систему прогнозирования уровня депрессии среди обучающихся, которая позволит выявить группы риска на ранних этапах. Результаты исследования могут быть полезны психологам, педагогам и администрации учебных заведений.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Описание набора данных для анализа\n", + "Набор данных содержит информацию о психологическом состоянии обучающихся и включает следующие поля:\n", + "- id – идентификатор, число\n", + "- Gender – пол, строка\n", + "- Age – возраст, дробное число\n", + "- City – город, строка\n", + "- Profession – профессия, строка\n", + "- Academic Pressure – академическое давление, дробное число (от 1.00 до 5.00)\n", + "- Work Pressure – рабочее давление, дробное число (от 0.00 до 5.00)\n", + "- CGPA – средний балл (GPA), дробное число\n", + "- Study Satisfaction – удовлетворенность учебой, дробное число (от 1.00 до 5.00)\n", + "- Job Satisfaction – удовлетворенность работой, дробное число (от 0.00 до 5.00)\n", + "- Sleep Duration – продолжительность сна, строка\n", + "- Dietary Habits – пищевые привычки, строка\n", + "- Degree – степень (образование), строка\n", + "- Have you ever had suicidal thoughts? – Были ли у вас когда-либо суицидальные мысли? 
строка (yes/no)\n", + "- Work/Study Hours – часы работы/учебы, дробное число\n", + "- Financial Stress – финансовый стресс, дробное число (от 1.00 до 5.00)\n", + "- Family History of Mental Illness – семейный анамнез психических заболеваний, строка (yes/no)\n", + "- Depression – депрессия, бинарная целевая переменная (1/0)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Обработка данных" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "Gender 0\n", + "Age 0\n", + "City 0\n", + "Profession 0\n", + "Academic Pressure 0\n", + "Work Pressure 0\n", + "CGPA 0\n", + "Study Satisfaction 0\n", + "Job Satisfaction 0\n", + "Sleep Duration 0\n", + "Dietary Habits 0\n", + "Degree 0\n", + "Have you ever had suicidal thoughts ? 0\n", + "Work/Study Hours 0\n", + "Financial Stress 3\n", + "Family History of Mental Illness 0\n", + "Depression 0\n", + "dtype: int64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "df.dropna(subset=['Financial Stress'], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "features = ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', \n", + " 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress', 'Depression']\n", + "\n", + "plt.figure(figsize=(15, 10))\n", + "for i, feature in enumerate(features, 1):\n", + " plt.subplot(3, 3, i)\n", + " sns.boxplot(y=df[feature], color='skyblue')\n", + " plt.title(f'Boxplot of {feature}')\n", + " plt.ylabel(feature)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В Age много выбросов. Сбалансируем данные" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "Q1 = df['Age'].quantile(0.25)\n", + "Q3 = df['Age'].quantile(0.75)\n", + "IQR = Q3 - Q1\n", + "\n", + "threshold = 1.5 * IQR\n", + "outliers = (df['Age'] < (Q1 - threshold)) | (df['Age'] > (Q3 + threshold))\n", + "\n", + "median_rating = df['Age'].median()\n", + "df.loc[outliers, 'Age'] = median_rating\n", + "\n", + "plt.figure(figsize=(8, 6))\n", + "sns.boxplot(y=df['Age'], color='skyblue')\n", + "plt.title('Boxplot of Age')\n", + "plt.ylabel('Age')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Конструирование признаков с помощью меток" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import LabelEncoder\n", + "\n", + "le = LabelEncoder()\n", + "df['Gender'] = le.fit_transform(df['Gender'])\n", + "df['City'] = le.fit_transform(df['City'])\n", + "df['Dietary Habits'] = le.fit_transform(df['Dietary Habits'])\n", + "df['Degree'] = le.fit_transform(df['Degree'])\n", + "df['Have you ever had suicidal thoughts ?'] = le.fit_transform(df['Have you ever had suicidal thoughts ?'])\n", + "df['Sleep Duration'] = le.fit_transform(df['Sleep Duration'])\n", + "df['Profession'] = le.fit_transform(df['Profession'])\n", + "df['Study Satisfaction'] = le.fit_transform(df['Study Satisfaction'])\n", + "df['Family History of Mental Illness'] = le.fit_transform(df['Family History of Mental Illness'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "разделение на признаки и целевую переменную" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "x = df.drop('Depression', axis=1)\n", + "y = df['Depression']" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + 
"\n", + "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1) Метод регрессии Лассо\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Лучшие гиперпараметры для Lasso:\n", + "{'alpha': 0.01, 'fit_intercept': False}\n" + ] + } + ], + "source": [ + "from sklearn.linear_model import Lasso\n", + "\n", + "param_grid_lasso = {\n", + " 'alpha': [0.01, 0.1, 1.0, 10.0],\n", + " 'fit_intercept': [True, False],\n", + "}\n", + "\n", + "# Создание объекта GridSearchCV\n", + "grid_search_lasso = GridSearchCV(\n", + " estimator=Lasso(), \n", + " param_grid=param_grid_lasso, \n", + " cv=5, \n", + " scoring='neg_mean_squared_error', \n", + " n_jobs=-1 \n", + ")\n", + "\n", + "grid_search_lasso.fit(x_train, y_train)\n", + "\n", + "# Вывод лучших гиперпараметров\n", + "print(\"Лучшие гиперпараметры для Lasso:\")\n", + "print(grid_search_lasso.best_params_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2) Метод градиентного бустинга" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py:540: FitFailedWarning: \n", + "1215 fits failed out of a total of 3645.\n", + "The score on these train-test partitions for these parameters will be set to nan.\n", + "If these failures are not expected, you can try to debug them by setting error_score='raise'.\n", + "\n", + "Below are more details about the failures:\n", + "--------------------------------------------------------------------------------\n", + "978 fits failed with the following error:\n", + "Traceback (most recent call last):\n", + " File 
\"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 888, in _fit_and_score\n", + " estimator.fit(X_train, y_train, **fit_params)\n", + " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 1466, in wrapper\n", + " estimator._validate_params()\n", + " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 666, in _validate_params\n", + " validate_parameter_constraints(\n", + " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py\", line 95, in validate_parameter_constraints\n", + " raise InvalidParameterError(\n", + "sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of GradientBoostingRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto' instead.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "237 fits failed with the following error:\n", + "Traceback (most recent call last):\n", + " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 888, in _fit_and_score\n", + " estimator.fit(X_train, y_train, **fit_params)\n", + " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 1466, in wrapper\n", + " estimator._validate_params()\n", + " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 666, in _validate_params\n", + " validate_parameter_constraints(\n", + " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py\", line 95, in validate_parameter_constraints\n", + " raise InvalidParameterError(\n", + "sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of GradientBoostingRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. 
Got 'auto' instead.\n", + "\n", + " warnings.warn(some_fits_failed_message, FitFailedWarning)\n", + "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\numpy\\ma\\core.py:2881: RuntimeWarning: invalid value encountered in cast\n", + " _data = np.array(data, dtype=dtype, copy=copy,\n", + "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:1103: UserWarning: One or more of the test scores are non-finite: [ nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan -0.18767441 -0.15799837 -0.13080278\n", + " -0.18762913 -0.15792709 -0.13056114 -0.18792038 -0.15737146 -0.130218\n", + " -0.18725961 -0.157967 -0.13047453 -0.18766583 -0.15779565 -0.13094863\n", + " -0.18798705 -0.15693978 -0.13061215 -0.18766317 -0.15746848 -0.13072918\n", + " -0.18864158 -0.15666133 -0.13095037 -0.18817206 -0.15805489 -0.13086126\n", + " -0.18707465 -0.15864932 -0.13104947 -0.18818902 -0.15828572 -0.13063871\n", + " -0.18701628 -0.15853864 -0.13019458 -0.18740927 -0.15836397 -0.13065455\n", + " -0.18768748 -0.15828297 -0.1309458 -0.18845004 -0.15696395 -0.13023062\n", + " -0.18754854 -0.15899615 -0.13061707 -0.18831427 -0.15819939 -0.13096524\n", + " -0.18662963 -0.15815869 -0.13089186 nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " -0.1758914 -0.1442684 -0.12093344 -0.1758927 -0.14423731 -0.12084543\n", + " -0.17573339 -0.14419842 -0.12076166 -0.17512045 -0.14435454 -0.1207299\n", + " -0.17669645 -0.14397965 -0.12087019 -0.17605424 -0.1438664 -0.12091068\n", + " -0.17582192 -0.1443651 -0.12097165 -0.17588422 -0.14421003 -0.12081764\n", + " -0.17522742 -0.14424357 -0.12086484 -0.17530986 -0.14433713 -0.12091757\n", + " -0.17565647 -0.14408902 -0.12075918 -0.17561884 -0.14426355 -0.12094066\n", + " -0.17522371 -0.1439869 -0.12099023 -0.17619772 -0.14396131 -0.12079667\n", + " -0.17710789 
-0.1448419 -0.12087822 -0.17608534 -0.14416684 -0.12087865\n", + " -0.1754675 -0.1442258 -0.12068226 -0.17611334 -0.14433552 -0.12093556\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan -0.16938321 -0.13763002 -0.11703902\n", + " -0.16953091 -0.13736586 -0.11695779 -0.16881837 -0.1375676 -0.11694438\n", + " -0.16927898 -0.13748177 -0.11689982 -0.16921265 -0.13757375 -0.11682524\n", + " -0.16915872 -0.13727377 -0.11694336 -0.16939766 -0.13734972 -0.1167447\n", + " -0.16924214 -0.1373768 -0.11674816 -0.16918278 -0.13746085 -0.1169816\n", + " -0.16927003 -0.13740063 -0.1169564 -0.16916501 -0.13752074 -0.11687641\n", + " -0.16928973 -0.13751536 -0.11697948 -0.16934836 -0.13727436 -0.11693615\n", + " -0.16912453 -0.13748699 -0.11693425 -0.1692788 -0.13750784 -0.11694655\n", + " -0.16919354 -0.13747437 -0.11708782 -0.16940009 -0.13757749 -0.11700586\n", + " -0.1692801 -0.13725384 -0.11684394 nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " -0.11606052 -0.1140225 -0.11403709 -0.11627212 -0.1139982 -0.11402075\n", + " -0.11613561 -0.11407941 -0.11420487 -0.11666225 -0.11462523 -0.11431901\n", + " -0.11604817 -0.11456211 -0.11392092 -0.11609343 -0.11394228 -0.11414071\n", + " -0.11611685 -0.11420178 -0.11405459 -0.11594404 -0.11408614 -0.11391662\n", + " -0.11590886 -0.11396465 -0.11389125 -0.11616694 -0.11441846 -0.11417015\n", + " -0.11617368 -0.11429765 -0.1139636 -0.11616763 -0.11433984 -0.11412121\n", + " -0.11625618 -0.11402999 -0.11419791 -0.11613603 -0.114206 -0.11423922\n", + " -0.1160801 -0.11431896 -0.11416734 -0.11608923 -0.11455498 -0.11417448\n", + " -0.11605165 -0.11427773 -0.11392205 -0.11606243 -0.11408421 -0.11395292\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan 
-0.11281447 -0.11245904 -0.11308822\n", + " -0.11256366 -0.11230094 -0.1130767 -0.11282651 -0.1121034 -0.11283479\n", + " -0.11260704 -0.1125136 -0.11288977 -0.11278304 -0.11242278 -0.11268564\n", + " -0.11263359 -0.11236227 -0.11329411 -0.11231603 -0.1124533 -0.11278826\n", + " -0.11291545 -0.11241223 -0.11250702 -0.11246481 -0.11228665 -0.11348916\n", + " -0.11250694 -0.11250274 -0.11298019 -0.11277323 -0.11248601 -0.11301753\n", + " -0.11259486 -0.1124685 -0.11285441 -0.11274424 -0.11232891 -0.11316456\n", + " -0.11274575 -0.11256149 -0.11252293 -0.11293524 -0.11261757 -0.11305628\n", + " -0.11253063 -0.11237109 -0.11278518 -0.1124074 -0.11276905 -0.11296684\n", + " -0.11258689 -0.11228467 -0.11331342 nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " -0.11292265 -0.11395193 -0.11564599 -0.11244356 -0.11338947 -0.1148266\n", + " -0.11295702 -0.11353862 -0.11510521 -0.11244347 -0.11387967 -0.11512396\n", + " -0.11269802 -0.11364442 -0.1151339 -0.11238356 -0.11364301 -0.11496543\n", + " -0.11229193 -0.11340926 -0.11550744 -0.11215818 -0.11367944 -0.11552889\n", + " -0.11240305 -0.11352309 -0.115412 -0.1128402 -0.11338749 -0.1153551\n", + " -0.11250042 -0.11347275 -0.11548445 -0.11271132 -0.11377527 -0.11558066\n", + " -0.11318598 -0.11325792 -0.11499103 -0.11253099 -0.1129829 -0.11530949\n", + " -0.11239074 -0.11329625 -0.11544761 -0.11262484 -0.11323392 -0.1151936\n", + " -0.11253889 -0.11382403 -0.11511129 -0.11250854 -0.11339898 -0.11536332\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan -0.11542253 -0.11498664 -0.11428517\n", + " -0.11503783 -0.11473447 -0.11458687 -0.11483866 -0.1154254 -0.11479037\n", + " -0.11533015 -0.11515195 -0.11460571 -0.11563491 -0.11433835 -0.11437413\n", + " -0.11510849 -0.11472156 -0.11516494 -0.11545009 -0.115001 -0.11479743\n", + " 
-0.11461761 -0.11537461 -0.11497109 -0.1155148 -0.11567353 -0.11431184\n", + " -0.11546067 -0.11462564 -0.11450721 -0.11511 -0.11487988 -0.11466523\n", + " -0.11585756 -0.11462611 -0.11433121 -0.11538152 -0.11463425 -0.11527088\n", + " -0.11509145 -0.11493588 -0.11484324 -0.11528905 -0.11426327 -0.11476508\n", + " -0.11499562 -0.11451299 -0.11466765 -0.11525918 -0.11469718 -0.11476983\n", + " -0.11467865 -0.1145067 -0.11479425 nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " -0.11352917 -0.1145882 -0.11643688 -0.11418115 -0.11442858 -0.11635549\n", + " -0.11408502 -0.11458383 -0.1163013 -0.1135842 -0.11453566 -0.11575264\n", + " -0.11341863 -0.11481638 -0.11635685 -0.1132144 -0.11438018 -0.11666005\n", + " -0.11311482 -0.11500883 -0.11594984 -0.11409228 -0.11464061 -0.1158012\n", + " -0.11389399 -0.11454081 -0.1157428 -0.11333869 -0.11438896 -0.11676006\n", + " -0.11382523 -0.11443669 -0.11606569 -0.11424726 -0.11464652 -0.11608159\n", + " -0.11396605 -0.11473188 -0.1167532 -0.1136805 -0.11455875 -0.11615814\n", + " -0.11372286 -0.11442829 -0.11590895 -0.1136509 -0.11368863 -0.11660073\n", + " -0.1136605 -0.1141187 -0.11613806 -0.11326355 -0.11427399 -0.11676148\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan -0.11573534 -0.11897501 -0.1226239\n", + " -0.1162633 -0.11939573 -0.12255715 -0.11636411 -0.11878021 -0.12306277\n", + " -0.11535113 -0.11813967 -0.1230085 -0.11594119 -0.11812955 -0.12217928\n", + " -0.11523023 -0.11843291 -0.12228252 -0.1159457 -0.11840108 -0.12181337\n", + " -0.11600134 -0.11790484 -0.12203724 -0.11579998 -0.11787918 -0.12317219\n", + " -0.11578704 -0.11837798 -0.12379234 -0.1155279 -0.11865384 -0.12319867\n", + " -0.11597008 -0.11886814 -0.12291788 -0.1162282 -0.11918752 -0.12363613\n", + " -0.11571473 -0.11805225 -0.12250506 -0.11640247 
-0.11823175 -0.1226976\n", + " -0.11571549 -0.11813327 -0.12229009 -0.11621545 -0.11793769 -0.1229533\n", + " -0.11528287 -0.1183919 -0.12121653]\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Лучшие гиперпараметры для Gradient Boosting:\n", + "{'learning_rate': 0.1, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}\n" + ] + } + ], + "source": [ + "\n", + "from sklearn.ensemble import GradientBoostingRegressor\n", + "\n", + "param_grid_gb = {\n", + " 'n_estimators': [50, 100, 200],\n", + " 'learning_rate': [0.01, 0.1, 0.2],\n", + " 'max_depth': [3, 5, 7],\n", + " 'min_samples_split': [2, 5, 10],\n", + " 'min_samples_leaf': [1, 2, 4],\n", + " 'max_features': ['auto', 'sqrt', 'log2']\n", + "}\n", + "\n", + "grid_search_gb = GridSearchCV(\n", + " estimator=GradientBoostingRegressor(),\n", + " param_grid=param_grid_gb,\n", + " cv=5,\n", + " scoring='neg_mean_squared_error',\n", + " n_jobs=-1\n", + ")\n", + "\n", + "grid_search_gb.fit(x_train, y_train)\n", + "\n", + "# Вывод лучших гиперпараметров\n", + "print(\"Лучшие гиперпараметры для Gradient Boosting:\")\n", + "print(grid_search_gb.best_params_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3) Метод k-ближайших соседей" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Лучшие гиперпараметры для k-Nearest Neighbors:\n", + "{'algorithm': 'ball_tree', 'n_neighbors': 10, 'p': 1, 'weights': 'distance'}\n" + ] + } + ], + "source": [ + "from sklearn.neighbors import KNeighborsRegressor\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "param_grid_knn = {\n", + " 'n_neighbors': [3, 5, 7, 10],\n", + " 'weights': ['uniform', 'distance'],\n", + " 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],\n", + " 'p': [1, 2]\n", + "}\n", + "\n", + 
"grid_search_knn = GridSearchCV(\n", + " estimator=KNeighborsRegressor(),\n", + " param_grid=param_grid_knn,\n", + " cv=5,\n", + " scoring='neg_mean_squared_error',\n", + " n_jobs=-1\n", + ")\n", + "\n", + "grid_search_knn.fit(x_train, y_train)\n", + "\n", + "# Вывод лучших гиперпараметров\n", + "print(\"Лучшие гиперпараметры для k-Nearest Neighbors:\")\n", + "print(grid_search_knn.best_params_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Предсказание на тестовой выборке" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred = model.predict(x_test)\n", + "y_pred_forest = model_forest.predict(x_test)\n", + "y_pred_lasso = model_lasso.predict(x_test)\n", + "y_pred_gb = model_gb.predict(x_test)\n", + "y_pred_neighbors = model_knn.predict(x_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Оценка качества модели" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1.\tMSE (Mean Squared Error)\n", + "Среднее значение квадратов разностей между предсказанными и фактическими значениями. Чем меньше значение, тем лучше модель." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean Squared Error (MSE):\n", + "k-NN: \t\t\t0.213\n", + "Random Forest: \t\t0.118\n", + "Lasso: \t\t\t0.166\n", + "Gradient Boosting: \t0.113\n", + "k-Nearest Neighbors: \t0.326\n" + ] + } + ], + "source": [ + "from sklearn.metrics import mean_squared_error\n", + "import numpy as np\n", + "\n", + "mse1 = mean_squared_error(y_test, y_pred)\n", + "mse2 = mean_squared_error(y_test, y_pred_forest)\n", + "mse3 = mean_squared_error(y_test, y_pred_lasso)\n", + "mse4 = mean_squared_error(y_test, y_pred_gb)\n", + "mse5 = mean_squared_error(y_test, y_pred_neighbors)\n", + "\n", + "mse1_rounded = round(mse1, 3)\n", + "mse2_rounded = round(mse2, 3)\n", + "mse3_rounded = round(mse3, 3)\n", + "mse4_rounded = round(mse4, 3)\n", + "mse5_rounded = round(mse5, 3)\n", + "\n", + "print(\"Mean Squared Error (MSE):\")\n", + "print(f\"k-NN: \\t\\t\\t{mse1_rounded}\")\n", + "print(f\"Random Forest: \\t\\t{mse2_rounded}\")\n", + "print(f\"Lasso: \\t\\t\\t{mse3_rounded}\")\n", + "print(f\"Gradient Boosting: \\t{mse4_rounded}\")\n", + "print(f\"k-Nearest Neighbors: \\t{mse5_rounded}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2.\tMAE\n", + "Среднее значение абсолютных разностей между предсказанными и фактическими значениями. Чем меньше значение, тем лучше модель." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean Absolute Error (MAE):\n", + "k-NN: \t\t\t0.213\n", + "Random Forest: \t\t0.238\n", + "Lasso: \t\t\t0.366\n", + "Gradient Boosting: \t0.246\n", + "k-Nearest Neighbors: \t0.485\n" + ] + } + ], + "source": [ + "from sklearn.metrics import mean_absolute_error\n", + "\n", + "mae1 = round(mean_absolute_error(y_test, y_pred),3)\n", + "mae2 = round(mean_absolute_error(y_test, y_pred_forest),3)\n", + "mae3 = round(mean_absolute_error(y_test, y_pred_lasso),3)\n", + "mae4 = round(mean_absolute_error(y_test, y_pred_gb),3)\n", + "mae5 = round(mean_absolute_error(y_test, y_pred_neighbors),3)\n", + "print(\"Mean Absolute Error (MAE):\")\n", + "print(f\"k-NN: \\t\\t\\t{mae1}\")\n", + "print(f\"Random Forest: \\t\\t{mae2}\")\n", + "print(f\"Lasso: \\t\\t\\t{mae3}\")\n", + "print(f\"Gradient Boosting: \\t{mae4}\")\n", + "print(f\"k-Nearest Neighbors: \\t{mae5}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3.\tR-squared\n", + "Мера, показывающая, насколько хорошо модель объясняет изменчивость данных. Значение обычно находится в диапазоне от 0 до 1, где 1 — идеальное соответствие, а 0 — модель не объясняет данные; отрицательное значение означает, что модель предсказывает хуже, чем простое среднее по выборке."
+ ] + }, + { + "cell_type": "code", + "execution_count": 153, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "R² (R-squared): 0.127933821917115\n", + "\n", + "R² (R-squared):\n", + "k-NN: \t\t\t0.128\n", + "Random Forest: \t\t0.515\n", + "Lasso: \t\t\t0.319\n", + "Gradient Boosting: \t0.537\n", + "k-Nearest Neighbors: \t-0.337\n" + ] + } + ], + "source": [ + "from sklearn.metrics import r2_score\n", + "r2 = r2_score(y_test, y_pred)\n", + "print(f\"R² (R-squared): {r2}\")\n", + "\n", + "r2_1 = r2_score(y_test, y_pred)\n", + "r2_2 = r2_score(y_test, y_pred_forest)\n", + "r2_3 = r2_score(y_test, y_pred_lasso)\n", + "r2_4 = r2_score(y_test, y_pred_gb)\n", + "r2_5 = r2_score(y_test, y_pred_neighbors)\n", + "\n", + "r2_1_rounded = round(r2_1, 3)\n", + "r2_2_rounded = round(r2_2, 3)\n", + "r2_3_rounded = round(r2_3, 3)\n", + "r2_4_rounded = round(r2_4, 3)\n", + "r2_5_rounded = round(r2_5, 3)\n", + "\n", + "print(\"\\nR² (R-squared):\")\n", + "print(f\"k-NN: \\t\\t\\t{r2_1_rounded}\")\n", + "print(f\"Random Forest: \\t\\t{r2_2_rounded}\")\n", + "print(f\"Lasso: \\t\\t\\t{r2_3_rounded}\")\n", + "print(f\"Gradient Boosting: \\t{r2_4_rounded}\")\n", + "print(f\"k-Nearest Neighbors: \\t{r2_5_rounded}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "4.\tRMSE (Root Mean Squared Error)\n", + " Квадратный корень из MSE — типичное отклонение предсказаний от реальных данных в единицах целевой переменной. Чем меньше значение, тем лучше модель."
+ ] + }, + { + "cell_type": "code", + "execution_count": 151, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Root Mean Squared Error (RMSE):\n", + "k-NN: \t\t\t0.461\n", + "Random Forest: \t\t0.344\n", + "Lasso: \t\t\t0.407\n", + "Gradient Boosting: \t0.336\n", + "k-Nearest Neighbors: \t0.571\n" + ] + } + ], + "source": [ + "rmse1 = np.sqrt(mse1)\n", + "rmse2 = np.sqrt(mse2)\n", + "rmse3 = np.sqrt(mse3)\n", + "rmse4 = np.sqrt(mse4)\n", + "rmse5 = np.sqrt(mse5)\n", + "\n", + "rmse1_rounded = round(rmse1, 3)\n", + "rmse2_rounded = round(rmse2, 3)\n", + "rmse3_rounded = round(rmse3, 3)\n", + "rmse4_rounded = round(rmse4, 3)\n", + "rmse5_rounded = round(rmse5, 3)\n", + "\n", + "print(\"Root Mean Squared Error (RMSE):\")\n", + "print(f\"k-NN: \\t\\t\\t{rmse1_rounded}\")\n", + "print(f\"Random Forest: \\t\\t{rmse2_rounded}\")\n", + "print(f\"Lasso: \\t\\t\\t{rmse3_rounded}\")\n", + "print(f\"Gradient Boosting: \\t{rmse4_rounded}\")\n", + "print(f\"k-Nearest Neighbors: \\t{rmse5_rounded}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Лучший результат – градиентный бустинг и случайный лес.\n", + "Положительные результаты по всем критериям получил случайный лес. Три из четырех положительных результатов – у градиентного бустинга. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Значит, случайный лес – наиболее точная и устойчивая стратегия обучения модели. Итоговая модель – model_forest." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Также с помощью оценки важности признаков (feature importance) в модели случайного леса мы определили основные факторы, вызывающие депрессию:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Feature Importance\n", + "13 Have you ever had suicidal thoughts ? 
0.300542\n", + "5 Academic Pressure 0.134276\n", + "0 id 0.087970\n", + "7 CGPA 0.079078\n", + "2 Age 0.066613\n", + "15 Financial Stress 0.066330\n", + "3 City 0.059293\n", + "14 Work/Study Hours 0.052275\n", + "12 Degree 0.049539\n", + "8 Study Satisfaction 0.032944\n", + "11 Dietary Habits 0.026140\n", + "10 Sleep Duration 0.024435\n", + "16 Family History of Mental Illness 0.010547\n", + "1 Gender 0.009627\n", + "4 Profession 0.000372\n", + "9 Job Satisfaction 0.000017\n", + "6 Work Pressure 0.000003\n" + ] + } + ], + "source": [ + "from sklearn.ensemble import RandomForestRegressor\n", + "\n", + "model_rf = RandomForestRegressor(n_estimators=100, random_state=42)\n", + "model_rf.fit(x_train, y_train)\n", + "\n", + "feature_importances = model_rf.feature_importances_\n", + "\n", + "import pandas as pd\n", + "feature_importance_df = pd.DataFrame({\n", + " 'Feature': x.columns,\n", + " 'Importance': feature_importances\n", + "}).sort_values(by='Importance', ascending=False)\n", + "\n", + "print(feature_importance_df)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Scripts", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 688bfe2e9beae86d59743968c3b874c54296c2be Mon Sep 17 00:00:00 2001 From: dex_moth Date: Sat, 21 Dec 2024 00:21:03 +0400 Subject: [PATCH 4/7] =?UTF-8?q?=D0=A3=D0=B4=D0=B0=D0=BB=D0=B8=D1=82=D1=8C?= =?UTF-8?q?=20lab=5F4/Lab4.ipynb?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lab_4/Lab4.ipynb | 911 ----------------------------------------------- 1 file changed, 911 deletions(-) delete mode 100644 lab_4/Lab4.ipynb diff --git a/lab_4/Lab4.ipynb b/lab_4/Lab4.ipynb deleted file mode 
100644 index 0b8116e..0000000 --- a/lab_4/Lab4.ipynb +++ /dev/null @@ -1,911 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'Gender', 'Age', 'City', 'Profession', 'Academic Pressure',\n", - " 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction',\n", - " 'Sleep Duration', 'Dietary Habits', 'Degree',\n", - " 'Have you ever had suicidal thoughts ?', 'Work/Study Hours',\n", - " 'Financial Stress', 'Family History of Mental Illness', 'Depression'],\n", - " dtype='object')\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "from matplotlib.ticker import FuncFormatter\n", - "\n", - "df = pd.read_csv(\".//csv//Student Depression Dataset.csv\")\n", - "print(df.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " id Gender Age City Profession Academic Pressure \\\n", - "0 2 Male 33.0 Visakhapatnam Student 5.0 \n", - "1 8 Female 24.0 Bangalore Student 2.0 \n", - "2 26 Male 31.0 Srinagar Student 3.0 \n", - "3 30 Female 28.0 Varanasi Student 3.0 \n", - "4 32 Female 25.0 Jaipur Student 4.0 \n", - "\n", - " Work Pressure CGPA Study Satisfaction Job Satisfaction \\\n", - "0 0.0 8.97 2.0 0.0 \n", - "1 0.0 5.90 5.0 0.0 \n", - "2 0.0 7.03 5.0 0.0 \n", - "3 0.0 5.59 2.0 0.0 \n", - "4 0.0 8.13 3.0 0.0 \n", - "\n", - " Sleep Duration Dietary Habits Degree \\\n", - "0 5-6 hours Healthy B.Pharm \n", - "1 5-6 hours Moderate BSc \n", - "2 Less than 5 hours Healthy BA \n", - "3 7-8 hours Moderate BCA \n", - "4 5-6 hours Moderate M.Tech \n", - "\n", - " Have you ever had suicidal thoughts ? 
Work/Study Hours Financial Stress \\\n", - "0 Yes 3.0 1.0 \n", - "1 No 3.0 2.0 \n", - "2 No 9.0 1.0 \n", - "3 Yes 4.0 5.0 \n", - "4 Yes 1.0 1.0 \n", - "\n", - " Family History of Mental Illness Depression \n", - "0 No 1 \n", - "1 Yes 0 \n", - "2 Yes 0 \n", - "3 Yes 1 \n", - "4 No 0 \n" - ] - } - ], - "source": [ - "print(df.head())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Бизнес-цель исследования\n", - "Разработать и внедрить систему прогнозирования уровня депрессии среди обучающихся, которая позволит выявить группы риска на ранних этапах. Результаты исследования могут быть полезны психологам, педагогам и администрации учебных заведений.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Описание набора данных для анализа\n", - "Набор данных содержит информацию о психологическом состоянии обучающихся и включает следующие поля:\n", - "- id – идентификатор, число\n", - "- Gender – пол, строка\n", - "- Age – возраст, дробное число\n", - "- City – город, строка\n", - "- Profession – профессия, строка\n", - "- Academic Pressure – академическое давление, дробное число (от 1.00 до 5.00)\n", - "- Work Pressure – рабочее давление, дробное число (от 1.00 до 5.00)\n", - "- CGPA – средний балл (GPA), дробное число\n", - "- Study Satisfaction – удовлетворенность учебой, дробное число (от 1.00 до 5.00)\n", - "- Job Satisfaction – удовлетворенность работой, дробное число (от 1.00 до 5.00)\n", - "- Sleep Duration – продолжительность сна, строка\n", - "- Dietary Habits – пищевые привычки, строка\n", - "- Degree – степень (образование), строка\n", - "- Have you ever had suicidal thoughts? – Были ли у вас когда-либо суицидальные мысли? 
строка (yes/no)\n", - "- Work/Study Hours – часы работы/учебы, дробное число\n", - "- Financial Stress – финансовый стресс, дробное число (от 1.00 до 5.00)\n", - "- Family History of Mental Illness – семейный анамнез психических заболеваний, строка (yes/no)\n", - "- Depression – депрессия, булевое значение (1/0)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Обработка данных" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "id 0\n", - "Gender 0\n", - "Age 0\n", - "City 0\n", - "Profession 0\n", - "Academic Pressure 0\n", - "Work Pressure 0\n", - "CGPA 0\n", - "Study Satisfaction 0\n", - "Job Satisfaction 0\n", - "Sleep Duration 0\n", - "Dietary Habits 0\n", - "Degree 0\n", - "Have you ever had suicidal thoughts ? 0\n", - "Work/Study Hours 0\n", - "Financial Stress 3\n", - "Family History of Mental Illness 0\n", - "Depression 0\n", - "dtype: int64" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.isnull().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "df.dropna(subset=['Financial Stress'], inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "features = ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', \n", - " 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress', 'Depression']\n", - "\n", - "plt.figure(figsize=(15, 10))\n", - "for i, feature in enumerate(features, 1):\n", - " plt.subplot(3, 3, i)\n", - " sns.boxplot(y=df[feature], color='skyblue')\n", - " plt.title(f'Boxplot of {feature}')\n", - " plt.ylabel(feature)\n", - "\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "В Age много выбросов. Сбалансируем данные" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "Q1 = df['Age'].quantile(0.25)\n", - "Q3 = df['Age'].quantile(0.75)\n", - "IQR = Q3 - Q1\n", - "\n", - "threshold = 1.5 * IQR\n", - "outliers = (df['Age'] < (Q1 - threshold)) | (df['Age'] > (Q3 + threshold))\n", - "\n", - "median_rating = df['Age'].median()\n", - "df.loc[outliers, 'Age'] = median_rating\n", - "\n", - "plt.figure(figsize=(8, 6))\n", - "sns.boxplot(y=df['Age'], color='skyblue')\n", - "plt.title('Boxplot of Age')\n", - "plt.ylabel('Age')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Конструирование признаков с помощью меток" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.preprocessing import LabelEncoder\n", - "\n", - "le = LabelEncoder()\n", - "df['Gender'] = le.fit_transform(df['Gender'])\n", - "df['City'] = le.fit_transform(df['City'])\n", - "df['Dietary Habits'] = le.fit_transform(df['Dietary Habits'])\n", - "df['Degree'] = le.fit_transform(df['Degree'])\n", - "df['Have you ever had suicidal thoughts ?'] = le.fit_transform(df['Have you ever had suicidal thoughts ?'])\n", - "df['Sleep Duration'] = le.fit_transform(df['Sleep Duration'])\n", - "df['Profession'] = le.fit_transform(df['Profession'])\n", - "df['Study Satisfaction'] = le.fit_transform(df['Study Satisfaction'])\n", - "df['Family History of Mental Illness'] = le.fit_transform(df['Family History of Mental Illness'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "разделение на признаки и целевую переменную" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "x = df.drop('Depression', axis=1)\n", - "y = df['Depression']" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - 
"\n", - "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1) Метод регрессии Лассо\n" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Лучшие гиперпараметры для Lasso:\n", - "{'alpha': 0.01, 'fit_intercept': False}\n" - ] - } - ], - "source": [ - "from sklearn.linear_model import Lasso\n", - "\n", - "param_grid_lasso = {\n", - " 'alpha': [0.01, 0.1, 1.0, 10.0],\n", - " 'fit_intercept': [True, False],\n", - "}\n", - "\n", - "# Создание объекта GridSearchCV\n", - "grid_search_lasso = GridSearchCV(\n", - " estimator=Lasso(), \n", - " param_grid=param_grid_lasso, \n", - " cv=5, \n", - " scoring='neg_mean_squared_error', \n", - " n_jobs=-1 \n", - ")\n", - "\n", - "grid_search_lasso.fit(x_train, y_train)\n", - "\n", - "# Вывод лучших гиперпараметров\n", - "print(\"Лучшие гиперпараметры для Lasso:\")\n", - "print(grid_search_lasso.best_params_)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2) Метод градиентного бустинга" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py:540: FitFailedWarning: \n", - "1215 fits failed out of a total of 3645.\n", - "The score on these train-test partitions for these parameters will be set to nan.\n", - "If these failures are not expected, you can try to debug them by setting error_score='raise'.\n", - "\n", - "Below are more details about the failures:\n", - "--------------------------------------------------------------------------------\n", - "978 fits failed with the following error:\n", - "Traceback (most recent call last):\n", - " File 
\"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 888, in _fit_and_score\n", - " estimator.fit(X_train, y_train, **fit_params)\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 1466, in wrapper\n", - " estimator._validate_params()\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 666, in _validate_params\n", - " validate_parameter_constraints(\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py\", line 95, in validate_parameter_constraints\n", - " raise InvalidParameterError(\n", - "sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of GradientBoostingRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto' instead.\n", - "\n", - "--------------------------------------------------------------------------------\n", - "237 fits failed with the following error:\n", - "Traceback (most recent call last):\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 888, in _fit_and_score\n", - " estimator.fit(X_train, y_train, **fit_params)\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 1466, in wrapper\n", - " estimator._validate_params()\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 666, in _validate_params\n", - " validate_parameter_constraints(\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py\", line 95, in validate_parameter_constraints\n", - " raise InvalidParameterError(\n", - "sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of GradientBoostingRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. 
Got 'auto' instead.\n", - "\n", - " warnings.warn(some_fits_failed_message, FitFailedWarning)\n", - "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\numpy\\ma\\core.py:2881: RuntimeWarning: invalid value encountered in cast\n", - " _data = np.array(data, dtype=dtype, copy=copy,\n", - "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:1103: UserWarning: One or more of the test scores are non-finite: [ nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan -0.18767441 -0.15799837 -0.13080278\n", - " -0.18762913 -0.15792709 -0.13056114 -0.18792038 -0.15737146 -0.130218\n", - " -0.18725961 -0.157967 -0.13047453 -0.18766583 -0.15779565 -0.13094863\n", - " -0.18798705 -0.15693978 -0.13061215 -0.18766317 -0.15746848 -0.13072918\n", - " -0.18864158 -0.15666133 -0.13095037 -0.18817206 -0.15805489 -0.13086126\n", - " -0.18707465 -0.15864932 -0.13104947 -0.18818902 -0.15828572 -0.13063871\n", - " -0.18701628 -0.15853864 -0.13019458 -0.18740927 -0.15836397 -0.13065455\n", - " -0.18768748 -0.15828297 -0.1309458 -0.18845004 -0.15696395 -0.13023062\n", - " -0.18754854 -0.15899615 -0.13061707 -0.18831427 -0.15819939 -0.13096524\n", - " -0.18662963 -0.15815869 -0.13089186 nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " -0.1758914 -0.1442684 -0.12093344 -0.1758927 -0.14423731 -0.12084543\n", - " -0.17573339 -0.14419842 -0.12076166 -0.17512045 -0.14435454 -0.1207299\n", - " -0.17669645 -0.14397965 -0.12087019 -0.17605424 -0.1438664 -0.12091068\n", - " -0.17582192 -0.1443651 -0.12097165 -0.17588422 -0.14421003 -0.12081764\n", - " -0.17522742 -0.14424357 -0.12086484 -0.17530986 -0.14433713 -0.12091757\n", - " -0.17565647 -0.14408902 -0.12075918 -0.17561884 -0.14426355 -0.12094066\n", - " -0.17522371 -0.1439869 -0.12099023 -0.17619772 -0.14396131 -0.12079667\n", - " -0.17710789 
-0.1448419 -0.12087822 -0.17608534 -0.14416684 -0.12087865\n", - " -0.1754675 -0.1442258 -0.12068226 -0.17611334 -0.14433552 -0.12093556\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan -0.16938321 -0.13763002 -0.11703902\n", - " -0.16953091 -0.13736586 -0.11695779 -0.16881837 -0.1375676 -0.11694438\n", - " -0.16927898 -0.13748177 -0.11689982 -0.16921265 -0.13757375 -0.11682524\n", - " -0.16915872 -0.13727377 -0.11694336 -0.16939766 -0.13734972 -0.1167447\n", - " -0.16924214 -0.1373768 -0.11674816 -0.16918278 -0.13746085 -0.1169816\n", - " -0.16927003 -0.13740063 -0.1169564 -0.16916501 -0.13752074 -0.11687641\n", - " -0.16928973 -0.13751536 -0.11697948 -0.16934836 -0.13727436 -0.11693615\n", - " -0.16912453 -0.13748699 -0.11693425 -0.1692788 -0.13750784 -0.11694655\n", - " -0.16919354 -0.13747437 -0.11708782 -0.16940009 -0.13757749 -0.11700586\n", - " -0.1692801 -0.13725384 -0.11684394 nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " -0.11606052 -0.1140225 -0.11403709 -0.11627212 -0.1139982 -0.11402075\n", - " -0.11613561 -0.11407941 -0.11420487 -0.11666225 -0.11462523 -0.11431901\n", - " -0.11604817 -0.11456211 -0.11392092 -0.11609343 -0.11394228 -0.11414071\n", - " -0.11611685 -0.11420178 -0.11405459 -0.11594404 -0.11408614 -0.11391662\n", - " -0.11590886 -0.11396465 -0.11389125 -0.11616694 -0.11441846 -0.11417015\n", - " -0.11617368 -0.11429765 -0.1139636 -0.11616763 -0.11433984 -0.11412121\n", - " -0.11625618 -0.11402999 -0.11419791 -0.11613603 -0.114206 -0.11423922\n", - " -0.1160801 -0.11431896 -0.11416734 -0.11608923 -0.11455498 -0.11417448\n", - " -0.11605165 -0.11427773 -0.11392205 -0.11606243 -0.11408421 -0.11395292\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan 
-0.11281447 -0.11245904 -0.11308822\n", - " -0.11256366 -0.11230094 -0.1130767 -0.11282651 -0.1121034 -0.11283479\n", - " -0.11260704 -0.1125136 -0.11288977 -0.11278304 -0.11242278 -0.11268564\n", - " -0.11263359 -0.11236227 -0.11329411 -0.11231603 -0.1124533 -0.11278826\n", - " -0.11291545 -0.11241223 -0.11250702 -0.11246481 -0.11228665 -0.11348916\n", - " -0.11250694 -0.11250274 -0.11298019 -0.11277323 -0.11248601 -0.11301753\n", - " -0.11259486 -0.1124685 -0.11285441 -0.11274424 -0.11232891 -0.11316456\n", - " -0.11274575 -0.11256149 -0.11252293 -0.11293524 -0.11261757 -0.11305628\n", - " -0.11253063 -0.11237109 -0.11278518 -0.1124074 -0.11276905 -0.11296684\n", - " -0.11258689 -0.11228467 -0.11331342 nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " -0.11292265 -0.11395193 -0.11564599 -0.11244356 -0.11338947 -0.1148266\n", - " -0.11295702 -0.11353862 -0.11510521 -0.11244347 -0.11387967 -0.11512396\n", - " -0.11269802 -0.11364442 -0.1151339 -0.11238356 -0.11364301 -0.11496543\n", - " -0.11229193 -0.11340926 -0.11550744 -0.11215818 -0.11367944 -0.11552889\n", - " -0.11240305 -0.11352309 -0.115412 -0.1128402 -0.11338749 -0.1153551\n", - " -0.11250042 -0.11347275 -0.11548445 -0.11271132 -0.11377527 -0.11558066\n", - " -0.11318598 -0.11325792 -0.11499103 -0.11253099 -0.1129829 -0.11530949\n", - " -0.11239074 -0.11329625 -0.11544761 -0.11262484 -0.11323392 -0.1151936\n", - " -0.11253889 -0.11382403 -0.11511129 -0.11250854 -0.11339898 -0.11536332\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan -0.11542253 -0.11498664 -0.11428517\n", - " -0.11503783 -0.11473447 -0.11458687 -0.11483866 -0.1154254 -0.11479037\n", - " -0.11533015 -0.11515195 -0.11460571 -0.11563491 -0.11433835 -0.11437413\n", - " -0.11510849 -0.11472156 -0.11516494 -0.11545009 -0.115001 -0.11479743\n", - " 
-0.11461761 -0.11537461 -0.11497109 -0.1155148 -0.11567353 -0.11431184\n", - " -0.11546067 -0.11462564 -0.11450721 -0.11511 -0.11487988 -0.11466523\n", - " -0.11585756 -0.11462611 -0.11433121 -0.11538152 -0.11463425 -0.11527088\n", - " -0.11509145 -0.11493588 -0.11484324 -0.11528905 -0.11426327 -0.11476508\n", - " -0.11499562 -0.11451299 -0.11466765 -0.11525918 -0.11469718 -0.11476983\n", - " -0.11467865 -0.1145067 -0.11479425 nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " -0.11352917 -0.1145882 -0.11643688 -0.11418115 -0.11442858 -0.11635549\n", - " -0.11408502 -0.11458383 -0.1163013 -0.1135842 -0.11453566 -0.11575264\n", - " -0.11341863 -0.11481638 -0.11635685 -0.1132144 -0.11438018 -0.11666005\n", - " -0.11311482 -0.11500883 -0.11594984 -0.11409228 -0.11464061 -0.1158012\n", - " -0.11389399 -0.11454081 -0.1157428 -0.11333869 -0.11438896 -0.11676006\n", - " -0.11382523 -0.11443669 -0.11606569 -0.11424726 -0.11464652 -0.11608159\n", - " -0.11396605 -0.11473188 -0.1167532 -0.1136805 -0.11455875 -0.11615814\n", - " -0.11372286 -0.11442829 -0.11590895 -0.1136509 -0.11368863 -0.11660073\n", - " -0.1136605 -0.1141187 -0.11613806 -0.11326355 -0.11427399 -0.11676148\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan -0.11573534 -0.11897501 -0.1226239\n", - " -0.1162633 -0.11939573 -0.12255715 -0.11636411 -0.11878021 -0.12306277\n", - " -0.11535113 -0.11813967 -0.1230085 -0.11594119 -0.11812955 -0.12217928\n", - " -0.11523023 -0.11843291 -0.12228252 -0.1159457 -0.11840108 -0.12181337\n", - " -0.11600134 -0.11790484 -0.12203724 -0.11579998 -0.11787918 -0.12317219\n", - " -0.11578704 -0.11837798 -0.12379234 -0.1155279 -0.11865384 -0.12319867\n", - " -0.11597008 -0.11886814 -0.12291788 -0.1162282 -0.11918752 -0.12363613\n", - " -0.11571473 -0.11805225 -0.12250506 -0.11640247 
-0.11823175 -0.1226976\n", - " -0.11571549 -0.11813327 -0.12229009 -0.11621545 -0.11793769 -0.1229533\n", - " -0.11528287 -0.1183919 -0.12121653]\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Лучшие гиперпараметры для Gradient Boosting:\n", - "{'learning_rate': 0.1, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}\n" - ] - } - ], - "source": [ - "\n", - "from sklearn.ensemble import GradientBoostingRegressor\n", - "\n", - "param_grid_gb = {\n", - " 'n_estimators': [50, 100, 200],\n", - " 'learning_rate': [0.01, 0.1, 0.2],\n", - " 'max_depth': [3, 5, 7],\n", - " 'min_samples_split': [2, 5, 10],\n", - " 'min_samples_leaf': [1, 2, 4],\n", - " 'max_features': ['auto', 'sqrt', 'log2']\n", - "}\n", - "\n", - "grid_search_gb = GridSearchCV(\n", - " estimator=GradientBoostingRegressor(),\n", - " param_grid=param_grid_gb,\n", - " cv=5,\n", - " scoring='neg_mean_squared_error',\n", - " n_jobs=-1\n", - ")\n", - "\n", - "grid_search_gb.fit(x_train, y_train)\n", - "\n", - "# Вывод лучших гиперпараметров\n", - "print(\"Лучшие гиперпараметры для Gradient Boosting:\")\n", - "print(grid_search_gb.best_params_)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 3) Метод k-ближайших соседей" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Лучшие гиперпараметры для k-Nearest Neighbors:\n", - "{'algorithm': 'ball_tree', 'n_neighbors': 10, 'p': 1, 'weights': 'distance'}\n" - ] - } - ], - "source": [ - "from sklearn.neighbors import KNeighborsRegressor\n", - "from sklearn.model_selection import GridSearchCV\n", - "\n", - "param_grid_knn = {\n", - " 'n_neighbors': [3, 5, 7, 10],\n", - " 'weights': ['uniform', 'distance'],\n", - " 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],\n", - " 'p': [1, 2]\n", - "}\n", - "\n", - 
"grid_search_knn = GridSearchCV(\n", - " estimator=KNeighborsRegressor(),\n", - " param_grid=param_grid_knn,\n", - " cv=5,\n", - " scoring='neg_mean_squared_error',\n", - " n_jobs=-1\n", - ")\n", - "\n", - "grid_search_knn.fit(x_train, y_train)\n", - "\n", - "# Вывод лучших гиперпараметров\n", - "print(\"Лучшие гиперпараметры для k-Nearest Neighbors:\")\n", - "print(grid_search_knn.best_params_)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Предсказание на тестовой выборке" - ] - }, - { - "cell_type": "code", - "execution_count": 128, - "metadata": {}, - "outputs": [], - "source": [ - "y_pred = model.predict(x_test)\n", - "y_pred_forest = model_forest.predict(x_test)\n", - "y_pred_lasso = model_lasso.predict(x_test)\n", - "y_pred_gb = model_gb.predict(x_test)\n", - "y_pred_neighbors = model_knn.predict(x_test)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Оценка качества модели" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "1.\tMSE (Mean Squared Error)\n", - "Среднее значение квадратов разностей между предсказанными и фактическими значениями. Чем меньше значение, тем лучше модель." 
- ] - }, - { - "cell_type": "code", - "execution_count": 156, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Mean Squared Error (MSE):\n", - "k-NN: \t\t\t0.213\n", - "Random Forest: \t\t0.118\n", - "Lasso: \t\t\t0.166\n", - "Gradient Boosting: \t0.113\n", - "k-Nearest Neighbors: \t0.326\n" - ] - } - ], - "source": [ - "from sklearn.metrics import mean_squared_error\n", - "import numpy as np\n", - "\n", - "mse1 = mean_squared_error(y_test, y_pred)\n", - "mse2 = mean_squared_error(y_test, y_pred_forest)\n", - "mse3 = mean_squared_error(y_test, y_pred_lasso)\n", - "mse4 = mean_squared_error(y_test, y_pred_gb)\n", - "mse5 = mean_squared_error(y_test, y_pred_neighbors)\n", - "\n", - "mse1_rounded = round(mse1, 3)\n", - "mse2_rounded = round(mse2, 3)\n", - "mse3_rounded = round(mse3, 3)\n", - "mse4_rounded = round(mse4, 3)\n", - "mse5_rounded = round(mse5, 3)\n", - "\n", - "print(\"Mean Squared Error (MSE):\")\n", - "print(f\"k-NN: \\t\\t\\t{mse1_rounded}\")\n", - "print(f\"Random Forest: \\t\\t{mse2_rounded}\")\n", - "print(f\"Lasso: \\t\\t\\t{mse3_rounded}\")\n", - "print(f\"Gradient Boosting: \\t{mse4_rounded}\")\n", - "print(f\"k-Nearest Neighbors: \\t{mse5_rounded}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "2.\tMAE\n", - "Среднее значение абсолютных разностей между предсказанными и фактическими значениями. Чем меньше значение, тем лучше модель." 
- ] - }, - { - "cell_type": "code", - "execution_count": 155, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Mean Absolute Error (MAE):\n", - "k-NN: \t\t\t0.213\n", - "Random Forest: \t\t0.238\n", - "Lasso: \t\t\t0.366\n", - "Gradient Boosting: \t0.246\n", - "k-Nearest Neighbors: \t0.485\n" - ] - } - ], - "source": [ - "from sklearn.metrics import mean_absolute_error\n", - "\n", - "mae1 = round(mean_absolute_error(y_test, y_pred),3)\n", - "mae2 = round(mean_absolute_error(y_test, y_pred_forest),3)\n", - "mae3 = round(mean_absolute_error(y_test, y_pred_lasso),3)\n", - "mae4 = round(mean_absolute_error(y_test, y_pred_gb),3)\n", - "mae5 = round(mean_absolute_error(y_test, y_pred_neighbors),3)\n", - "print(\"Mean Absolute Error (MAE):\")\n", - "print(f\"k-NN: \\t\\t\\t{mae1}\")\n", - "print(f\"Random Forest: \\t\\t{mae2}\")\n", - "print(f\"Lasso: \\t\\t\\t{mae3}\")\n", - "print(f\"Gradient Boosting: \\t{mae4}\")\n", - "print(f\"k-Nearest Neighbors: \\t{mae5}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "3.\tR-squared\n", - "Мера, показывающая, насколько хорошо модель объясняет изменчивость данных. Значение находится в диапазоне от 0 до 1, где 1 — идеальное соответствие, а 0 — модель не объясняет данные." 
- ] - }, - { - "cell_type": "code", - "execution_count": 153, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "R² (R-squared): 0.127933821917115\n", - "\n", - "R² (R-squared):\n", - "k-NN: \t\t\t0.128\n", - "Random Forest: \t\t0.515\n", - "Lasso: \t\t\t0.319\n", - "Gradient Boosting: \t0.537\n", - "k-Nearest Neighbors: \t-0.337\n" - ] - } - ], - "source": [ - "from sklearn.metrics import r2_score\n", - "r2 = r2_score(y_test, y_pred)\n", - "print(f\"R² (R-squared): {r2}\")\n", - "\n", - "r2_1 = r2_score(y_test, y_pred)\n", - "r2_2 = r2_score(y_test, y_pred_forest)\n", - "r2_3 = r2_score(y_test, y_pred_lasso)\n", - "r2_4 = r2_score(y_test, y_pred_gb)\n", - "r2_5 = r2_score(y_test, y_pred_neighbors)\n", - "\n", - "r2_1_rounded = round(r2_1, 3)\n", - "r2_2_rounded = round(r2_2, 3)\n", - "r2_3_rounded = round(r2_3, 3)\n", - "r2_4_rounded = round(r2_4, 3)\n", - "r2_5_rounded = round(r2_5, 3)\n", - "\n", - "print(\"\\nR² (R-squared):\")\n", - "print(f\"k-NN: \\t\\t\\t{r2_1_rounded}\")\n", - "print(f\"Random Forest: \\t\\t{r2_2_rounded}\")\n", - "print(f\"Lasso: \\t\\t\\t{r2_3_rounded}\")\n", - "print(f\"Gradient Boosting: \\t{r2_4_rounded}\")\n", - "print(f\"k-Nearest Neighbors: \\t{r2_5_rounded}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "4.\tRMSE\n", - " Среднее отклонение предсказаний от реальных данных. Чем меньше модуль, тем лучше модель." 
- ] - }, - { - "cell_type": "code", - "execution_count": 151, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Root Mean Squared Error (RMSE):\n", - "k-NN: \t\t\t0.461\n", - "Random Forest: \t\t0.344\n", - "Lasso: \t\t\t0.407\n", - "Gradient Boosting: \t0.336\n", - "k-Nearest Neighbors: \t0.571\n" - ] - } - ], - "source": [ - "rmse1 = np.sqrt(mse1)\n", - "rmse2 = np.sqrt(mse2)\n", - "rmse3 = np.sqrt(mse3)\n", - "rmse4 = np.sqrt(mse4)\n", - "rmse5 = np.sqrt(mse5)\n", - "\n", - "rmse1_rounded = round(rmse1, 3)\n", - "rmse2_rounded = round(rmse2, 3)\n", - "rmse3_rounded = round(rmse3, 3)\n", - "rmse4_rounded = round(rmse4, 3)\n", - "rmse5_rounded = round(rmse5, 3)\n", - "\n", - "print(\"Root Mean Squared Error (RMSE):\")\n", - "print(f\"k-NN: \\t\\t\\t{rmse1_rounded}\")\n", - "print(f\"Random Forest: \\t\\t{rmse2_rounded}\")\n", - "print(f\"Lasso: \\t\\t\\t{rmse3_rounded}\")\n", - "print(f\"Gradient Boosting: \\t{rmse4_rounded}\")\n", - "print(f\"k-Nearest Neighbors: \\t{rmse5_rounded}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Лучший результат – градиентный бустинг и случайный лес.\n", - "Положительные результаты по всем критериям получил случайный лес. Три из четырех положительных результата у градиентного бустинга. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Значит, случайный лес – наиболее точная и устойчивая стратегия обучения модели. Итоговая модель – model_forest." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Также, с помощью применение важности признаков (feature importance) на Случайном лесе, мы вывели основные факторы, вызывающие депрессию:" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Feature Importance\n", - "13 Have you ever had suicidal thoughts ? 
0.300542\n", - "5 Academic Pressure 0.134276\n", - "0 id 0.087970\n", - "7 CGPA 0.079078\n", - "2 Age 0.066613\n", - "15 Financial Stress 0.066330\n", - "3 City 0.059293\n", - "14 Work/Study Hours 0.052275\n", - "12 Degree 0.049539\n", - "8 Study Satisfaction 0.032944\n", - "11 Dietary Habits 0.026140\n", - "10 Sleep Duration 0.024435\n", - "16 Family History of Mental Illness 0.010547\n", - "1 Gender 0.009627\n", - "4 Profession 0.000372\n", - "9 Job Satisfaction 0.000017\n", - "6 Work Pressure 0.000003\n" - ] - } - ], - "source": [ - "from sklearn.ensemble import RandomForestRegressor\n", - "\n", - "model_rf = RandomForestRegressor(n_estimators=100, random_state=42)\n", - "model_rf.fit(x_train, y_train)\n", - "\n", - "feature_importances = model_rf.feature_importances_\n", - "\n", - "import pandas as pd\n", - "feature_importance_df = pd.DataFrame({\n", - " 'Feature': x.columns,\n", - " 'Importance': feature_importances\n", - "}).sort_values(by='Importance', ascending=False)\n", - "\n", - "print(feature_importance_df)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Scripts", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 8e79432bfa2fb82eb4a66a6812e80d5a508012e0 Mon Sep 17 00:00:00 2001 From: dex_moth Date: Sat, 21 Dec 2024 00:25:04 +0400 Subject: [PATCH 5/7] lab4 --- lab_4/Lab4.ipynb | 911 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 911 insertions(+) create mode 100644 lab_4/Lab4.ipynb diff --git a/lab_4/Lab4.ipynb b/lab_4/Lab4.ipynb new file mode 100644 index 0000000..0b8116e --- /dev/null +++ b/lab_4/Lab4.ipynb @@ -0,0 +1,911 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['id', 'Gender', 'Age', 'City', 'Profession', 'Academic Pressure',\n", + " 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction',\n", + " 'Sleep Duration', 'Dietary Habits', 'Degree',\n", + " 'Have you ever had suicidal thoughts ?', 'Work/Study Hours',\n", + " 'Financial Stress', 'Family History of Mental Illness', 'Depression'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from matplotlib.ticker import FuncFormatter\n", + "\n", + "df = pd.read_csv(\".//csv//Student Depression Dataset.csv\")\n", + "print(df.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " id Gender Age City Profession Academic Pressure \\\n", + "0 2 Male 33.0 Visakhapatnam Student 5.0 \n", + "1 8 Female 24.0 Bangalore Student 2.0 \n", + "2 26 Male 31.0 Srinagar Student 3.0 \n", + "3 30 Female 28.0 Varanasi Student 3.0 \n", + "4 32 Female 25.0 Jaipur Student 4.0 \n", + "\n", + " Work Pressure CGPA Study Satisfaction Job Satisfaction \\\n", + "0 0.0 8.97 2.0 0.0 \n", + "1 0.0 5.90 5.0 0.0 \n", + "2 0.0 7.03 5.0 0.0 \n", + "3 0.0 5.59 2.0 0.0 \n", + "4 0.0 8.13 3.0 0.0 \n", + "\n", + " Sleep Duration Dietary Habits Degree \\\n", + "0 5-6 hours Healthy B.Pharm \n", + "1 5-6 hours Moderate BSc \n", + "2 Less than 5 hours Healthy BA \n", + "3 7-8 hours Moderate BCA \n", + "4 5-6 hours Moderate M.Tech \n", + "\n", + " Have you ever had suicidal thoughts ? 
Work/Study Hours Financial Stress \\\n", + "0 Yes 3.0 1.0 \n", + "1 No 3.0 2.0 \n", + "2 No 9.0 1.0 \n", + "3 Yes 4.0 5.0 \n", + "4 Yes 1.0 1.0 \n", + "\n", + " Family History of Mental Illness Depression \n", + "0 No 1 \n", + "1 Yes 0 \n", + "2 Yes 0 \n", + "3 Yes 1 \n", + "4 No 0 \n" + ] + } + ], + "source": [ + "print(df.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Бизнес-цель исследования\n", + "Разработать и внедрить систему прогнозирования уровня депрессии среди обучающихся, которая позволит выявить группы риска на ранних этапах. Результаты исследования могут быть полезны психологам, педагогам и администрации учебных заведений.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Описание набора данных для анализа\n", + "Набор данных содержит информацию о психологическом состоянии обучающихся и включает следующие поля:\n", + "- id – идентификатор, число\n", + "- Gender – пол, строка\n", + "- Age – возраст, дробное число\n", + "- City – город, строка\n", + "- Profession – профессия, строка\n", + "- Academic Pressure – академическое давление, дробное число (от 1.00 до 5.00)\n", + "- Work Pressure – рабочее давление, дробное число (от 1.00 до 5.00)\n", + "- CGPA – средний балл (GPA), дробное число\n", + "- Study Satisfaction – удовлетворенность учебой, дробное число (от 1.00 до 5.00)\n", + "- Job Satisfaction – удовлетворенность работой, дробное число (от 1.00 до 5.00)\n", + "- Sleep Duration – продолжительность сна, строка\n", + "- Dietary Habits – пищевые привычки, строка\n", + "- Degree – степень (образование), строка\n", + "- Have you ever had suicidal thoughts? – Были ли у вас когда-либо суицидальные мысли? 
строка (yes/no)\n", + "- Work/Study Hours – часы работы/учебы, дробное число\n", + "- Financial Stress – финансовый стресс, дробное число (от 1.00 до 5.00)\n", + "- Family History of Mental Illness – семейный анамнез психических заболеваний, строка (yes/no)\n", + "- Depression – депрессия, булевое значение (1/0)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Обработка данных" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "Gender 0\n", + "Age 0\n", + "City 0\n", + "Profession 0\n", + "Academic Pressure 0\n", + "Work Pressure 0\n", + "CGPA 0\n", + "Study Satisfaction 0\n", + "Job Satisfaction 0\n", + "Sleep Duration 0\n", + "Dietary Habits 0\n", + "Degree 0\n", + "Have you ever had suicidal thoughts ? 0\n", + "Work/Study Hours 0\n", + "Financial Stress 3\n", + "Family History of Mental Illness 0\n", + "Depression 0\n", + "dtype: int64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "df.dropna(subset=['Financial Stress'], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "features = ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', \n", + " 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress', 'Depression']\n", + "\n", + "plt.figure(figsize=(15, 10))\n", + "for i, feature in enumerate(features, 1):\n", + " plt.subplot(3, 3, i)\n", + " sns.boxplot(y=df[feature], color='skyblue')\n", + " plt.title(f'Boxplot of {feature}')\n", + " plt.ylabel(feature)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В Age много выбросов. Сбалансируем данные" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "Q1 = df['Age'].quantile(0.25)\n", + "Q3 = df['Age'].quantile(0.75)\n", + "IQR = Q3 - Q1\n", + "\n", + "threshold = 1.5 * IQR\n", + "outliers = (df['Age'] < (Q1 - threshold)) | (df['Age'] > (Q3 + threshold))\n", + "\n", + "median_rating = df['Age'].median()\n", + "df.loc[outliers, 'Age'] = median_rating\n", + "\n", + "plt.figure(figsize=(8, 6))\n", + "sns.boxplot(y=df['Age'], color='skyblue')\n", + "plt.title('Boxplot of Age')\n", + "plt.ylabel('Age')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Конструирование признаков с помощью меток" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import LabelEncoder\n", + "\n", + "le = LabelEncoder()\n", + "df['Gender'] = le.fit_transform(df['Gender'])\n", + "df['City'] = le.fit_transform(df['City'])\n", + "df['Dietary Habits'] = le.fit_transform(df['Dietary Habits'])\n", + "df['Degree'] = le.fit_transform(df['Degree'])\n", + "df['Have you ever had suicidal thoughts ?'] = le.fit_transform(df['Have you ever had suicidal thoughts ?'])\n", + "df['Sleep Duration'] = le.fit_transform(df['Sleep Duration'])\n", + "df['Profession'] = le.fit_transform(df['Profession'])\n", + "df['Study Satisfaction'] = le.fit_transform(df['Study Satisfaction'])\n", + "df['Family History of Mental Illness'] = le.fit_transform(df['Family History of Mental Illness'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "разделение на признаки и целевую переменную" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "x = df.drop('Depression', axis=1)\n", + "y = df['Depression']" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + 
"\n", + "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1) Метод регрессии Лассо\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Лучшие гиперпараметры для Lasso:\n", + "{'alpha': 0.01, 'fit_intercept': False}\n" + ] + } + ], + "source": [ + "from sklearn.linear_model import Lasso\n", + "\n", + "param_grid_lasso = {\n", + " 'alpha': [0.01, 0.1, 1.0, 10.0],\n", + " 'fit_intercept': [True, False],\n", + "}\n", + "\n", + "# Создание объекта GridSearchCV\n", + "grid_search_lasso = GridSearchCV(\n", + " estimator=Lasso(), \n", + " param_grid=param_grid_lasso, \n", + " cv=5, \n", + " scoring='neg_mean_squared_error', \n", + " n_jobs=-1 \n", + ")\n", + "\n", + "grid_search_lasso.fit(x_train, y_train)\n", + "\n", + "# Вывод лучших гиперпараметров\n", + "print(\"Лучшие гиперпараметры для Lasso:\")\n", + "print(grid_search_lasso.best_params_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2) Метод градиентного бустинга" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py:540: FitFailedWarning: \n", + "1215 fits failed out of a total of 3645.\n", + "The score on these train-test partitions for these parameters will be set to nan.\n", + "If these failures are not expected, you can try to debug them by setting error_score='raise'.\n", + "\n", + "Below are more details about the failures:\n", + "--------------------------------------------------------------------------------\n", + "978 fits failed with the following error:\n", + "Traceback (most recent call last):\n", + " File 
\"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 888, in _fit_and_score\n", + " estimator.fit(X_train, y_train, **fit_params)\n", + " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 1466, in wrapper\n", + " estimator._validate_params()\n", + " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 666, in _validate_params\n", + " validate_parameter_constraints(\n", + " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py\", line 95, in validate_parameter_constraints\n", + " raise InvalidParameterError(\n", + "sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of GradientBoostingRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto' instead.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "237 fits failed with the following error:\n", + "Traceback (most recent call last):\n", + " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 888, in _fit_and_score\n", + " estimator.fit(X_train, y_train, **fit_params)\n", + " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 1466, in wrapper\n", + " estimator._validate_params()\n", + " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 666, in _validate_params\n", + " validate_parameter_constraints(\n", + " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py\", line 95, in validate_parameter_constraints\n", + " raise InvalidParameterError(\n", + "sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of GradientBoostingRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. 
Got 'auto' instead.\n", + "\n", + " warnings.warn(some_fits_failed_message, FitFailedWarning)\n", + "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\numpy\\ma\\core.py:2881: RuntimeWarning: invalid value encountered in cast\n", + " _data = np.array(data, dtype=dtype, copy=copy,\n", + "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:1103: UserWarning: One or more of the test scores are non-finite: [ nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan -0.18767441 -0.15799837 -0.13080278\n", + " -0.18762913 -0.15792709 -0.13056114 -0.18792038 -0.15737146 -0.130218\n", + " -0.18725961 -0.157967 -0.13047453 -0.18766583 -0.15779565 -0.13094863\n", + " -0.18798705 -0.15693978 -0.13061215 -0.18766317 -0.15746848 -0.13072918\n", + " -0.18864158 -0.15666133 -0.13095037 -0.18817206 -0.15805489 -0.13086126\n", + " -0.18707465 -0.15864932 -0.13104947 -0.18818902 -0.15828572 -0.13063871\n", + " -0.18701628 -0.15853864 -0.13019458 -0.18740927 -0.15836397 -0.13065455\n", + " -0.18768748 -0.15828297 -0.1309458 -0.18845004 -0.15696395 -0.13023062\n", + " -0.18754854 -0.15899615 -0.13061707 -0.18831427 -0.15819939 -0.13096524\n", + " -0.18662963 -0.15815869 -0.13089186 nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " -0.1758914 -0.1442684 -0.12093344 -0.1758927 -0.14423731 -0.12084543\n", + " -0.17573339 -0.14419842 -0.12076166 -0.17512045 -0.14435454 -0.1207299\n", + " -0.17669645 -0.14397965 -0.12087019 -0.17605424 -0.1438664 -0.12091068\n", + " -0.17582192 -0.1443651 -0.12097165 -0.17588422 -0.14421003 -0.12081764\n", + " -0.17522742 -0.14424357 -0.12086484 -0.17530986 -0.14433713 -0.12091757\n", + " -0.17565647 -0.14408902 -0.12075918 -0.17561884 -0.14426355 -0.12094066\n", + " -0.17522371 -0.1439869 -0.12099023 -0.17619772 -0.14396131 -0.12079667\n", + " -0.17710789 
-0.1448419 -0.12087822 -0.17608534 -0.14416684 -0.12087865\n", + " -0.1754675 -0.1442258 -0.12068226 -0.17611334 -0.14433552 -0.12093556\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan -0.16938321 -0.13763002 -0.11703902\n", + " -0.16953091 -0.13736586 -0.11695779 -0.16881837 -0.1375676 -0.11694438\n", + " -0.16927898 -0.13748177 -0.11689982 -0.16921265 -0.13757375 -0.11682524\n", + " -0.16915872 -0.13727377 -0.11694336 -0.16939766 -0.13734972 -0.1167447\n", + " -0.16924214 -0.1373768 -0.11674816 -0.16918278 -0.13746085 -0.1169816\n", + " -0.16927003 -0.13740063 -0.1169564 -0.16916501 -0.13752074 -0.11687641\n", + " -0.16928973 -0.13751536 -0.11697948 -0.16934836 -0.13727436 -0.11693615\n", + " -0.16912453 -0.13748699 -0.11693425 -0.1692788 -0.13750784 -0.11694655\n", + " -0.16919354 -0.13747437 -0.11708782 -0.16940009 -0.13757749 -0.11700586\n", + " -0.1692801 -0.13725384 -0.11684394 nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " -0.11606052 -0.1140225 -0.11403709 -0.11627212 -0.1139982 -0.11402075\n", + " -0.11613561 -0.11407941 -0.11420487 -0.11666225 -0.11462523 -0.11431901\n", + " -0.11604817 -0.11456211 -0.11392092 -0.11609343 -0.11394228 -0.11414071\n", + " -0.11611685 -0.11420178 -0.11405459 -0.11594404 -0.11408614 -0.11391662\n", + " -0.11590886 -0.11396465 -0.11389125 -0.11616694 -0.11441846 -0.11417015\n", + " -0.11617368 -0.11429765 -0.1139636 -0.11616763 -0.11433984 -0.11412121\n", + " -0.11625618 -0.11402999 -0.11419791 -0.11613603 -0.114206 -0.11423922\n", + " -0.1160801 -0.11431896 -0.11416734 -0.11608923 -0.11455498 -0.11417448\n", + " -0.11605165 -0.11427773 -0.11392205 -0.11606243 -0.11408421 -0.11395292\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan 
-0.11281447 -0.11245904 -0.11308822\n", + " -0.11256366 -0.11230094 -0.1130767 -0.11282651 -0.1121034 -0.11283479\n", + " -0.11260704 -0.1125136 -0.11288977 -0.11278304 -0.11242278 -0.11268564\n", + " -0.11263359 -0.11236227 -0.11329411 -0.11231603 -0.1124533 -0.11278826\n", + " -0.11291545 -0.11241223 -0.11250702 -0.11246481 -0.11228665 -0.11348916\n", + " -0.11250694 -0.11250274 -0.11298019 -0.11277323 -0.11248601 -0.11301753\n", + " -0.11259486 -0.1124685 -0.11285441 -0.11274424 -0.11232891 -0.11316456\n", + " -0.11274575 -0.11256149 -0.11252293 -0.11293524 -0.11261757 -0.11305628\n", + " -0.11253063 -0.11237109 -0.11278518 -0.1124074 -0.11276905 -0.11296684\n", + " -0.11258689 -0.11228467 -0.11331342 nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " -0.11292265 -0.11395193 -0.11564599 -0.11244356 -0.11338947 -0.1148266\n", + " -0.11295702 -0.11353862 -0.11510521 -0.11244347 -0.11387967 -0.11512396\n", + " -0.11269802 -0.11364442 -0.1151339 -0.11238356 -0.11364301 -0.11496543\n", + " -0.11229193 -0.11340926 -0.11550744 -0.11215818 -0.11367944 -0.11552889\n", + " -0.11240305 -0.11352309 -0.115412 -0.1128402 -0.11338749 -0.1153551\n", + " -0.11250042 -0.11347275 -0.11548445 -0.11271132 -0.11377527 -0.11558066\n", + " -0.11318598 -0.11325792 -0.11499103 -0.11253099 -0.1129829 -0.11530949\n", + " -0.11239074 -0.11329625 -0.11544761 -0.11262484 -0.11323392 -0.1151936\n", + " -0.11253889 -0.11382403 -0.11511129 -0.11250854 -0.11339898 -0.11536332\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan -0.11542253 -0.11498664 -0.11428517\n", + " -0.11503783 -0.11473447 -0.11458687 -0.11483866 -0.1154254 -0.11479037\n", + " -0.11533015 -0.11515195 -0.11460571 -0.11563491 -0.11433835 -0.11437413\n", + " -0.11510849 -0.11472156 -0.11516494 -0.11545009 -0.115001 -0.11479743\n", + " 
-0.11461761 -0.11537461 -0.11497109 -0.1155148 -0.11567353 -0.11431184\n", + " -0.11546067 -0.11462564 -0.11450721 -0.11511 -0.11487988 -0.11466523\n", + " -0.11585756 -0.11462611 -0.11433121 -0.11538152 -0.11463425 -0.11527088\n", + " -0.11509145 -0.11493588 -0.11484324 -0.11528905 -0.11426327 -0.11476508\n", + " -0.11499562 -0.11451299 -0.11466765 -0.11525918 -0.11469718 -0.11476983\n", + " -0.11467865 -0.1145067 -0.11479425 nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " -0.11352917 -0.1145882 -0.11643688 -0.11418115 -0.11442858 -0.11635549\n", + " -0.11408502 -0.11458383 -0.1163013 -0.1135842 -0.11453566 -0.11575264\n", + " -0.11341863 -0.11481638 -0.11635685 -0.1132144 -0.11438018 -0.11666005\n", + " -0.11311482 -0.11500883 -0.11594984 -0.11409228 -0.11464061 -0.1158012\n", + " -0.11389399 -0.11454081 -0.1157428 -0.11333869 -0.11438896 -0.11676006\n", + " -0.11382523 -0.11443669 -0.11606569 -0.11424726 -0.11464652 -0.11608159\n", + " -0.11396605 -0.11473188 -0.1167532 -0.1136805 -0.11455875 -0.11615814\n", + " -0.11372286 -0.11442829 -0.11590895 -0.1136509 -0.11368863 -0.11660073\n", + " -0.1136605 -0.1141187 -0.11613806 -0.11326355 -0.11427399 -0.11676148\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan nan nan nan\n", + " nan nan nan -0.11573534 -0.11897501 -0.1226239\n", + " -0.1162633 -0.11939573 -0.12255715 -0.11636411 -0.11878021 -0.12306277\n", + " -0.11535113 -0.11813967 -0.1230085 -0.11594119 -0.11812955 -0.12217928\n", + " -0.11523023 -0.11843291 -0.12228252 -0.1159457 -0.11840108 -0.12181337\n", + " -0.11600134 -0.11790484 -0.12203724 -0.11579998 -0.11787918 -0.12317219\n", + " -0.11578704 -0.11837798 -0.12379234 -0.1155279 -0.11865384 -0.12319867\n", + " -0.11597008 -0.11886814 -0.12291788 -0.1162282 -0.11918752 -0.12363613\n", + " -0.11571473 -0.11805225 -0.12250506 -0.11640247 
-0.11823175 -0.1226976\n", + " -0.11571549 -0.11813327 -0.12229009 -0.11621545 -0.11793769 -0.1229533\n", + " -0.11528287 -0.1183919 -0.12121653]\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Лучшие гиперпараметры для Gradient Boosting:\n", + "{'learning_rate': 0.1, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}\n" + ] + } + ], + "source": [ + "\n", + "from sklearn.ensemble import GradientBoostingRegressor\n", + "\n", + "param_grid_gb = {\n", + " 'n_estimators': [50, 100, 200],\n", + " 'learning_rate': [0.01, 0.1, 0.2],\n", + " 'max_depth': [3, 5, 7],\n", + " 'min_samples_split': [2, 5, 10],\n", + " 'min_samples_leaf': [1, 2, 4],\n", + " 'max_features': ['auto', 'sqrt', 'log2']\n", + "}\n", + "\n", + "grid_search_gb = GridSearchCV(\n", + " estimator=GradientBoostingRegressor(),\n", + " param_grid=param_grid_gb,\n", + " cv=5,\n", + " scoring='neg_mean_squared_error',\n", + " n_jobs=-1\n", + ")\n", + "\n", + "grid_search_gb.fit(x_train, y_train)\n", + "\n", + "# Вывод лучших гиперпараметров\n", + "print(\"Лучшие гиперпараметры для Gradient Boosting:\")\n", + "print(grid_search_gb.best_params_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3) Метод k-ближайших соседей" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Лучшие гиперпараметры для k-Nearest Neighbors:\n", + "{'algorithm': 'ball_tree', 'n_neighbors': 10, 'p': 1, 'weights': 'distance'}\n" + ] + } + ], + "source": [ + "from sklearn.neighbors import KNeighborsRegressor\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "param_grid_knn = {\n", + " 'n_neighbors': [3, 5, 7, 10],\n", + " 'weights': ['uniform', 'distance'],\n", + " 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],\n", + " 'p': [1, 2]\n", + "}\n", + "\n", + 
"grid_search_knn = GridSearchCV(\n", + " estimator=KNeighborsRegressor(),\n", + " param_grid=param_grid_knn,\n", + " cv=5,\n", + " scoring='neg_mean_squared_error',\n", + " n_jobs=-1\n", + ")\n", + "\n", + "grid_search_knn.fit(x_train, y_train)\n", + "\n", + "# Вывод лучших гиперпараметров\n", + "print(\"Лучшие гиперпараметры для k-Nearest Neighbors:\")\n", + "print(grid_search_knn.best_params_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Предсказание на тестовой выборке" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred = model.predict(x_test)\n", + "y_pred_forest = model_forest.predict(x_test)\n", + "y_pred_lasso = model_lasso.predict(x_test)\n", + "y_pred_gb = model_gb.predict(x_test)\n", + "y_pred_neighbors = model_knn.predict(x_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Оценка качества модели" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1.\tMSE (Mean Squared Error)\n", + "Среднее значение квадратов разностей между предсказанными и фактическими значениями. Чем меньше значение, тем лучше модель." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean Squared Error (MSE):\n", + "k-NN: \t\t\t0.213\n", + "Random Forest: \t\t0.118\n", + "Lasso: \t\t\t0.166\n", + "Gradient Boosting: \t0.113\n", + "k-Nearest Neighbors: \t0.326\n" + ] + } + ], + "source": [ + "from sklearn.metrics import mean_squared_error\n", + "import numpy as np\n", + "\n", + "mse1 = mean_squared_error(y_test, y_pred)\n", + "mse2 = mean_squared_error(y_test, y_pred_forest)\n", + "mse3 = mean_squared_error(y_test, y_pred_lasso)\n", + "mse4 = mean_squared_error(y_test, y_pred_gb)\n", + "mse5 = mean_squared_error(y_test, y_pred_neighbors)\n", + "\n", + "mse1_rounded = round(mse1, 3)\n", + "mse2_rounded = round(mse2, 3)\n", + "mse3_rounded = round(mse3, 3)\n", + "mse4_rounded = round(mse4, 3)\n", + "mse5_rounded = round(mse5, 3)\n", + "\n", + "print(\"Mean Squared Error (MSE):\")\n", + "print(f\"k-NN: \\t\\t\\t{mse1_rounded}\")\n", + "print(f\"Random Forest: \\t\\t{mse2_rounded}\")\n", + "print(f\"Lasso: \\t\\t\\t{mse3_rounded}\")\n", + "print(f\"Gradient Boosting: \\t{mse4_rounded}\")\n", + "print(f\"k-Nearest Neighbors: \\t{mse5_rounded}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2.\tMAE\n", + "Среднее значение абсолютных разностей между предсказанными и фактическими значениями. Чем меньше значение, тем лучше модель." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean Absolute Error (MAE):\n", + "k-NN: \t\t\t0.213\n", + "Random Forest: \t\t0.238\n", + "Lasso: \t\t\t0.366\n", + "Gradient Boosting: \t0.246\n", + "k-Nearest Neighbors: \t0.485\n" + ] + } + ], + "source": [ + "from sklearn.metrics import mean_absolute_error\n", + "\n", + "mae1 = round(mean_absolute_error(y_test, y_pred),3)\n", + "mae2 = round(mean_absolute_error(y_test, y_pred_forest),3)\n", + "mae3 = round(mean_absolute_error(y_test, y_pred_lasso),3)\n", + "mae4 = round(mean_absolute_error(y_test, y_pred_gb),3)\n", + "mae5 = round(mean_absolute_error(y_test, y_pred_neighbors),3)\n", + "print(\"Mean Absolute Error (MAE):\")\n", + "print(f\"k-NN: \\t\\t\\t{mae1}\")\n", + "print(f\"Random Forest: \\t\\t{mae2}\")\n", + "print(f\"Lasso: \\t\\t\\t{mae3}\")\n", + "print(f\"Gradient Boosting: \\t{mae4}\")\n", + "print(f\"k-Nearest Neighbors: \\t{mae5}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3.\tR-squared\n", + "Мера, показывающая, насколько хорошо модель объясняет изменчивость данных. Значение не превышает 1: 1 — идеальное соответствие, 0 — модель не лучше предсказания средним значением, а отрицательное значение означает, что модель хуже такого предсказания."
+ ] + }, + { + "cell_type": "code", + "execution_count": 153, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "R² (R-squared): 0.127933821917115\n", + "\n", + "R² (R-squared):\n", + "k-NN: \t\t\t0.128\n", + "Random Forest: \t\t0.515\n", + "Lasso: \t\t\t0.319\n", + "Gradient Boosting: \t0.537\n", + "k-Nearest Neighbors: \t-0.337\n" + ] + } + ], + "source": [ + "from sklearn.metrics import r2_score\n", + "r2 = r2_score(y_test, y_pred)\n", + "print(f\"R² (R-squared): {r2}\")\n", + "\n", + "r2_1 = r2_score(y_test, y_pred)\n", + "r2_2 = r2_score(y_test, y_pred_forest)\n", + "r2_3 = r2_score(y_test, y_pred_lasso)\n", + "r2_4 = r2_score(y_test, y_pred_gb)\n", + "r2_5 = r2_score(y_test, y_pred_neighbors)\n", + "\n", + "r2_1_rounded = round(r2_1, 3)\n", + "r2_2_rounded = round(r2_2, 3)\n", + "r2_3_rounded = round(r2_3, 3)\n", + "r2_4_rounded = round(r2_4, 3)\n", + "r2_5_rounded = round(r2_5, 3)\n", + "\n", + "print(\"\\nR² (R-squared):\")\n", + "print(f\"k-NN: \\t\\t\\t{r2_1_rounded}\")\n", + "print(f\"Random Forest: \\t\\t{r2_2_rounded}\")\n", + "print(f\"Lasso: \\t\\t\\t{r2_3_rounded}\")\n", + "print(f\"Gradient Boosting: \\t{r2_4_rounded}\")\n", + "print(f\"k-Nearest Neighbors: \\t{r2_5_rounded}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "4.\tRMSE\n", + " Квадратный корень из MSE: типичная величина отклонения предсказаний от реальных данных в единицах целевой переменной. Чем меньше значение, тем лучше модель." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 151, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Root Mean Squared Error (RMSE):\n", + "k-NN: \t\t\t0.461\n", + "Random Forest: \t\t0.344\n", + "Lasso: \t\t\t0.407\n", + "Gradient Boosting: \t0.336\n", + "k-Nearest Neighbors: \t0.571\n" + ] + } + ], + "source": [ + "rmse1 = np.sqrt(mse1)\n", + "rmse2 = np.sqrt(mse2)\n", + "rmse3 = np.sqrt(mse3)\n", + "rmse4 = np.sqrt(mse4)\n", + "rmse5 = np.sqrt(mse5)\n", + "\n", + "rmse1_rounded = round(rmse1, 3)\n", + "rmse2_rounded = round(rmse2, 3)\n", + "rmse3_rounded = round(rmse3, 3)\n", + "rmse4_rounded = round(rmse4, 3)\n", + "rmse5_rounded = round(rmse5, 3)\n", + "\n", + "print(\"Root Mean Squared Error (RMSE):\")\n", + "print(f\"k-NN: \\t\\t\\t{rmse1_rounded}\")\n", + "print(f\"Random Forest: \\t\\t{rmse2_rounded}\")\n", + "print(f\"Lasso: \\t\\t\\t{rmse3_rounded}\")\n", + "print(f\"Gradient Boosting: \\t{rmse4_rounded}\")\n", + "print(f\"k-Nearest Neighbors: \\t{rmse5_rounded}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Лучший результат – градиентный бустинг и случайный лес.\n", + "Градиентный бустинг лучше по трём метрикам из четырёх (MSE, R², RMSE), случайный лес немного лучше по MAE; разница между ними невелика." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Случайный лес показывает близкое к лучшему качество по всем метрикам, поэтому он выбран итоговой моделью – model_forest." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Также с помощью важности признаков (feature importance) случайного леса мы определили признаки, которые сильнее всего влияют на предсказание депрессии:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Feature Importance\n", + "13 Have you ever had suicidal thoughts ? 
0.300542\n", + "5 Academic Pressure 0.134276\n", + "0 id 0.087970\n", + "7 CGPA 0.079078\n", + "2 Age 0.066613\n", + "15 Financial Stress 0.066330\n", + "3 City 0.059293\n", + "14 Work/Study Hours 0.052275\n", + "12 Degree 0.049539\n", + "8 Study Satisfaction 0.032944\n", + "11 Dietary Habits 0.026140\n", + "10 Sleep Duration 0.024435\n", + "16 Family History of Mental Illness 0.010547\n", + "1 Gender 0.009627\n", + "4 Profession 0.000372\n", + "9 Job Satisfaction 0.000017\n", + "6 Work Pressure 0.000003\n" + ] + } + ], + "source": [ + "from sklearn.ensemble import RandomForestRegressor\n", + "\n", + "model_rf = RandomForestRegressor(n_estimators=100, random_state=42)\n", + "model_rf.fit(x_train, y_train)\n", + "\n", + "feature_importances = model_rf.feature_importances_\n", + "\n", + "import pandas as pd\n", + "feature_importance_df = pd.DataFrame({\n", + " 'Feature': x.columns,\n", + " 'Importance': feature_importances\n", + "}).sort_values(by='Importance', ascending=False)\n", + "\n", + "print(feature_importance_df)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Scripts", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 2fca9fd006456c562db7b2491925dd3fe634ded8 Mon Sep 17 00:00:00 2001 From: dex_moth Date: Sat, 21 Dec 2024 01:02:44 +0400 Subject: [PATCH 6/7] lab 5 --- lab_5/Lab5.ipynb | 336 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 336 insertions(+) create mode 100644 lab_5/Lab5.ipynb diff --git a/lab_5/Lab5.ipynb b/lab_5/Lab5.ipynb new file mode 100644 index 0000000..fabdfd0 --- /dev/null +++ b/lab_5/Lab5.ipynb @@ -0,0 +1,336 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 
Лабораторная 5" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['stock index', 'country', 'year', 'index price', 'log_indexprice',\n", + " 'inflationrate', 'oil prices', 'exchange_rate', 'gdppercent',\n", + " 'percapitaincome', 'unemploymentrate', 'manufacturingoutput',\n", + " 'tradebalance', 'USTreasury'],\n", + " dtype='object')\n", + " stock index country year index price log_indexprice \\\n", + "0 NASDAQ United States of America 1980.0 168.61 2.23 \n", + "1 NASDAQ United States of America 1981.0 203.15 2.31 \n", + "2 NASDAQ United States of America 1982.0 188.98 2.28 \n", + "3 NASDAQ United States of America 1983.0 285.43 2.46 \n", + "4 NASDAQ United States of America 1984.0 248.89 2.40 \n", + "\n", + " inflationrate oil prices exchange_rate gdppercent percapitaincome \\\n", + "0 0.14 21.59 1.0 0.09 12575.0 \n", + "1 0.10 31.77 1.0 0.12 13976.0 \n", + "2 0.06 28.52 1.0 0.04 14434.0 \n", + "3 0.03 26.19 1.0 0.09 15544.0 \n", + "4 0.04 25.88 1.0 0.11 17121.0 \n", + "\n", + " unemploymentrate manufacturingoutput tradebalance USTreasury \n", + "0 0.07 NaN -13.06 0.11 \n", + "1 0.08 NaN -12.52 0.14 \n", + "2 0.10 NaN -19.97 0.13 \n", + "3 0.10 NaN -51.64 0.11 \n", + "4 0.08 NaN -102.73 0.12 \n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.cluster import KMeans\n", + "from scipy.cluster.hierarchy import dendrogram, linkage, fcluster\n", + "from sklearn.metrics import silhouette_score\n", + "\n", + "df = pd.read_csv(\".//csv//EconomicData.csv\")\n", + "print(df.columns)\n", + "print(df.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Бизнес-цель: сегментировать страны на основе экономических 
показателей для определения схожих групп стран и последующего анализа каждой группы." + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Данные содержат текстовые значения.\n", + "Исходный размер датасета: 369\n", + "Очищенный размер датасета: 219\n" + ] + } + ], + "source": [ + "df = df.copy()\n", + "df_clean = df.dropna()\n", + "\n", + "if not np.issubdtype(df_clean.dtypes.iloc[1], np.number):\n", + " print(\"Данные содержат текстовые значения.\")\n", + " cleaned_data = df_clean.select_dtypes(include=[np.number])\n", + "\n", + "print(f\"Исходный размер датасета: {df.shape[0]}\")\n", + "print(f\"Очищенный размер датасета: {df_clean.shape[0]}\")\n", + "\n", + "df = pd.get_dummies(df_clean, columns=['country'], drop_first=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "# Выбор признаков для кластеризации\n", + "features = ['index price', 'inflationrate', 'oil prices', 'exchange_rate', 'gdppercent', \n", + " 'percapitaincome', 'unemploymentrate', 'manufacturingoutput', 'tradebalance', 'USTreasury']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Стандартизируем, чтобы устранить влияние масштаба.\n", + "А также понизим размерность с помощью РСА для уменьшения количества признаков для визуализации данных." + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "# Предобработка данных: стандартизация\n", + "scaler = StandardScaler()\n", + "scaled_data = scaler.fit_transform(df[features])\n", + "\n", + "# Понижение размерности с помощью PCA\n", + "pca = PCA(n_components=2)\n", + "pca_data = pca.fit_transform(scaled_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Используем метод главных компонент (PCA) для уменьшения размерности данных до 2D для визуализации." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Non-hierarchical clustering (K-Means)\n", + "kmeans = KMeans(n_clusters=3, random_state=42)\n", + "kmeans_labels = kmeans.fit_predict(scaled_data)\n", + "\n", + "# Plot the K-Means clusters in the 2-D PCA projection\n", + "plt.figure(figsize=(10, 6))\n", + "sns.scatterplot(x=pca_data[:, 0], y=pca_data[:, 1], hue=kmeans_labels, palette='inferno', s=100)\n", + "plt.title('K-Means Clustering')\n", + "plt.xlabel('PC1')\n", + "plt.ylabel('PC2')\n", + "plt.show()\n", + "\n", + "# Silhouette score for the K-Means labels\n", + "silhouette_avg_kmeans = silhouette_score(scaled_data, kmeans_labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Для иерархической кластеризации потребуется предварительно определить количество кластеров, так как она не возвращает метки кластеров." + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Иерархическая кластеризация\n", + "linked = linkage(scaled_data, 'ward')\n", + "\n", + "# Визуализация\n", + "plt.figure(figsize=(10, 7))\n", + "dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=True)\n", + "plt.title('Иерархическая')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "# Определение меток\n", + "n_clusters = 3\n", + "hierarchical_labels = fcluster(linked, n_clusters, criterion='maxclust')" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + " # Визуализация кластеров\n", + "plt.figure(figsize=(10, 6))\n", + "sns.scatterplot(x=pca_data[:, 0], y=pca_data[:, 1], hue=hierarchical_labels, palette='inferno', s=100)\n", + "plt.title('Иерархическая')\n", + "plt.xlabel('PC1')\n", + "plt.ylabel('PC2')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " year index price log_indexprice inflationrate oil prices \\\n", + "Cluster \n", + "0 1998.054545 13563.522364 3.929091 0.054182 34.765091 \n", + "1 2005.619048 7237.508776 3.713401 0.020680 48.031361 \n", + "2 2009.294118 3554.822941 3.494118 0.022941 59.845294 \n", + "\n", + " exchange_rate gdppercent percapitaincome unemploymentrate \\\n", + "Cluster \n", + "0 85.857273 0.045818 7502.927273 0.061818 \n", + "1 6.610340 0.029320 27037.510204 0.077823 \n", + "2 1.000000 0.025294 49157.352941 0.058235 \n", + "\n", + " manufacturingoutput tradebalance USTreasury \n", + "Cluster \n", + "0 132.100000 -6.739455 0.063636 \n", + "1 473.491633 34.495510 0.042993 \n", + "2 251.887059 -555.851765 0.035294 \n" + ] + } + ], + "source": [ + "# Добавление меток кластеров в исходный датафрейм\n", + "df['Cluster'] = kmeans_labels\n", + "\n", + "# Удаление нечисловых столбцов перед вычислением среднего\n", + "numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns\n", + "cluster_analysis = df.groupby('Cluster')[numeric_columns].mean()\n", + "\n", + "# Вывод результата\n", + "print(cluster_analysis)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Оценка для неиерархического: 0.20251\n", + "Оценка для иерархического: 0.20251\n" + ] + } + ], + "source": [ + "# Оценка\n", + "print(f\"Оценка для неиерархического: 
{round(silhouette_avg_kmeans,5)}\")\n", + "\n", + "silhouette_avg = silhouette_score(scaled_data, hierarchical_labels)  # hierarchical labels, not the K-Means ones\n", + "print(f\"Оценка для иерархического: {round(silhouette_avg,5)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Силуэтная оценка считается отдельно по меткам K-Means и по меткам иерархической кластеризации; чем ближе значение к 1, тем лучше разделены кластеры, поэтому метод с большей оценкой лучше сегментирует страны." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Scripts", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 3863672121382f4b167c410aef759d36b4d10b11 Mon Sep 17 00:00:00 2001 From: dex_moth Date: Sat, 21 Dec 2024 12:33:06 +0400 Subject: [PATCH 7/7] correct lab4 --- lab_4/Lab4.ipynb | 256 +++++++++++++---------------------------------- 1 file changed, 70 insertions(+), 186 deletions(-) diff --git a/lab_4/Lab4.ipynb b/lab_4/Lab4.ipynb index 0b8116e..62032e7 100644 --- a/lab_4/Lab4.ipynb +++ b/lab_4/Lab4.ipynb @@ -23,6 +23,13 @@ "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from matplotlib.ticker import FuncFormatter\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.linear_model import Lasso\n", + "from sklearn.ensemble import GradientBoostingRegressor\n", + "from sklearn.neighbors import KNeighborsRegressor\n", "\n", "df = pd.read_csv(\".//csv//Student Depression Dataset.csv\")\n", "print(df.columns)" @@ -293,6 +300,50 @@ "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)" ] }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "# Создание конвейера\n", + "\n", + "# Обработаем данные\n", + "# Определим категориальные и числовые признаки\n", + "categorical_features = ['Gender', 'City', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Profession', 'Family History of Mental Illness', 'Sleep Duration']\n", + "numerical_features = ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress']\n", + "\n", + "categorical_transformer = Pipeline(steps=[\n", + " ('onehot', OneHotEncoder(handle_unknown='ignore'))\n", + "])\n", + "\n", + "numerical_transformer = Pipeline(steps=[\n", + " ('scaler', StandardScaler())\n", + "])\n", + "\n", + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " ('num', numerical_transformer, numerical_features),\n", + " ('cat', categorical_transformer, categorical_features)\n", + " ])\n", + "\n", + "# Построим модели\n", + "pipeline_lasso = Pipeline(steps=[\n", + " ('preprocessor', preprocessor),\n", + " ('model', Lasso())\n", + "])\n", + "\n", + "pipeline_gb = Pipeline(steps=[\n", + " ('preprocessor', preprocessor),\n", + " ('model', GradientBoostingRegressor())\n", + "])\n", + "\n", + "pipeline_knn = Pipeline(steps=[\n", + " ('preprocessor', preprocessor),\n", + " ('model', KNeighborsRegressor())\n", + "])" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -302,7 +353,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -310,7 +361,7 @@ "output_type": "stream", "text": [ "Лучшие гиперпараметры для Lasso:\n", - "{'alpha': 0.01, 'fit_intercept': False}\n" + "{'model__alpha': 0.01, 'model__fit_intercept': False}\n" ] } ], @@ -318,8 +369,8 @@ "from sklearn.linear_model import Lasso\n", "\n", "param_grid_lasso = {\n", - " 'alpha': [0.01, 0.1, 1.0, 10.0],\n", - " 'fit_intercept': [True, False],\n", + " 'model__alpha': [0.01, 0.1, 1.0, 10.0],\n", + " 
'model__fit_intercept': [True, False],\n", "}\n", "\n", "# Создание объекта GridSearchCV\n", @@ -347,193 +398,28 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 2, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py:540: FitFailedWarning: \n", - "1215 fits failed out of a total of 3645.\n", - "The score on these train-test partitions for these parameters will be set to nan.\n", - "If these failures are not expected, you can try to debug them by setting error_score='raise'.\n", - "\n", - "Below are more details about the failures:\n", - "--------------------------------------------------------------------------------\n", - "978 fits failed with the following error:\n", - "Traceback (most recent call last):\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 888, in _fit_and_score\n", - " estimator.fit(X_train, y_train, **fit_params)\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 1466, in wrapper\n", - " estimator._validate_params()\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 666, in _validate_params\n", - " validate_parameter_constraints(\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py\", line 95, in validate_parameter_constraints\n", - " raise InvalidParameterError(\n", - "sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of GradientBoostingRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. 
Got 'auto' instead.\n", - "\n", - "--------------------------------------------------------------------------------\n", - "237 fits failed with the following error:\n", - "Traceback (most recent call last):\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 888, in _fit_and_score\n", - " estimator.fit(X_train, y_train, **fit_params)\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 1466, in wrapper\n", - " estimator._validate_params()\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\base.py\", line 666, in _validate_params\n", - " validate_parameter_constraints(\n", - " File \"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py\", line 95, in validate_parameter_constraints\n", - " raise InvalidParameterError(\n", - "sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of GradientBoostingRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. 
Got 'auto' instead.\n", - "\n", - " warnings.warn(some_fits_failed_message, FitFailedWarning)\n", - "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\numpy\\ma\\core.py:2881: RuntimeWarning: invalid value encountered in cast\n", - " _data = np.array(data, dtype=dtype, copy=copy,\n", - "e:\\AIM1.5\\Scripts\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:1103: UserWarning: One or more of the test scores are non-finite: [ nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan -0.18767441 -0.15799837 -0.13080278\n", - " -0.18762913 -0.15792709 -0.13056114 -0.18792038 -0.15737146 -0.130218\n", - " -0.18725961 -0.157967 -0.13047453 -0.18766583 -0.15779565 -0.13094863\n", - " -0.18798705 -0.15693978 -0.13061215 -0.18766317 -0.15746848 -0.13072918\n", - " -0.18864158 -0.15666133 -0.13095037 -0.18817206 -0.15805489 -0.13086126\n", - " -0.18707465 -0.15864932 -0.13104947 -0.18818902 -0.15828572 -0.13063871\n", - " -0.18701628 -0.15853864 -0.13019458 -0.18740927 -0.15836397 -0.13065455\n", - " -0.18768748 -0.15828297 -0.1309458 -0.18845004 -0.15696395 -0.13023062\n", - " -0.18754854 -0.15899615 -0.13061707 -0.18831427 -0.15819939 -0.13096524\n", - " -0.18662963 -0.15815869 -0.13089186 nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " -0.1758914 -0.1442684 -0.12093344 -0.1758927 -0.14423731 -0.12084543\n", - " -0.17573339 -0.14419842 -0.12076166 -0.17512045 -0.14435454 -0.1207299\n", - " -0.17669645 -0.14397965 -0.12087019 -0.17605424 -0.1438664 -0.12091068\n", - " -0.17582192 -0.1443651 -0.12097165 -0.17588422 -0.14421003 -0.12081764\n", - " -0.17522742 -0.14424357 -0.12086484 -0.17530986 -0.14433713 -0.12091757\n", - " -0.17565647 -0.14408902 -0.12075918 -0.17561884 -0.14426355 -0.12094066\n", - " -0.17522371 -0.1439869 -0.12099023 -0.17619772 -0.14396131 -0.12079667\n", - " -0.17710789 
-0.1448419 -0.12087822 -0.17608534 -0.14416684 -0.12087865\n", - " -0.1754675 -0.1442258 -0.12068226 -0.17611334 -0.14433552 -0.12093556\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan -0.16938321 -0.13763002 -0.11703902\n", - " -0.16953091 -0.13736586 -0.11695779 -0.16881837 -0.1375676 -0.11694438\n", - " -0.16927898 -0.13748177 -0.11689982 -0.16921265 -0.13757375 -0.11682524\n", - " -0.16915872 -0.13727377 -0.11694336 -0.16939766 -0.13734972 -0.1167447\n", - " -0.16924214 -0.1373768 -0.11674816 -0.16918278 -0.13746085 -0.1169816\n", - " -0.16927003 -0.13740063 -0.1169564 -0.16916501 -0.13752074 -0.11687641\n", - " -0.16928973 -0.13751536 -0.11697948 -0.16934836 -0.13727436 -0.11693615\n", - " -0.16912453 -0.13748699 -0.11693425 -0.1692788 -0.13750784 -0.11694655\n", - " -0.16919354 -0.13747437 -0.11708782 -0.16940009 -0.13757749 -0.11700586\n", - " -0.1692801 -0.13725384 -0.11684394 nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " -0.11606052 -0.1140225 -0.11403709 -0.11627212 -0.1139982 -0.11402075\n", - " -0.11613561 -0.11407941 -0.11420487 -0.11666225 -0.11462523 -0.11431901\n", - " -0.11604817 -0.11456211 -0.11392092 -0.11609343 -0.11394228 -0.11414071\n", - " -0.11611685 -0.11420178 -0.11405459 -0.11594404 -0.11408614 -0.11391662\n", - " -0.11590886 -0.11396465 -0.11389125 -0.11616694 -0.11441846 -0.11417015\n", - " -0.11617368 -0.11429765 -0.1139636 -0.11616763 -0.11433984 -0.11412121\n", - " -0.11625618 -0.11402999 -0.11419791 -0.11613603 -0.114206 -0.11423922\n", - " -0.1160801 -0.11431896 -0.11416734 -0.11608923 -0.11455498 -0.11417448\n", - " -0.11605165 -0.11427773 -0.11392205 -0.11606243 -0.11408421 -0.11395292\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan 
-0.11281447 -0.11245904 -0.11308822\n", - " -0.11256366 -0.11230094 -0.1130767 -0.11282651 -0.1121034 -0.11283479\n", - " -0.11260704 -0.1125136 -0.11288977 -0.11278304 -0.11242278 -0.11268564\n", - " -0.11263359 -0.11236227 -0.11329411 -0.11231603 -0.1124533 -0.11278826\n", - " -0.11291545 -0.11241223 -0.11250702 -0.11246481 -0.11228665 -0.11348916\n", - " -0.11250694 -0.11250274 -0.11298019 -0.11277323 -0.11248601 -0.11301753\n", - " -0.11259486 -0.1124685 -0.11285441 -0.11274424 -0.11232891 -0.11316456\n", - " -0.11274575 -0.11256149 -0.11252293 -0.11293524 -0.11261757 -0.11305628\n", - " -0.11253063 -0.11237109 -0.11278518 -0.1124074 -0.11276905 -0.11296684\n", - " -0.11258689 -0.11228467 -0.11331342 nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " -0.11292265 -0.11395193 -0.11564599 -0.11244356 -0.11338947 -0.1148266\n", - " -0.11295702 -0.11353862 -0.11510521 -0.11244347 -0.11387967 -0.11512396\n", - " -0.11269802 -0.11364442 -0.1151339 -0.11238356 -0.11364301 -0.11496543\n", - " -0.11229193 -0.11340926 -0.11550744 -0.11215818 -0.11367944 -0.11552889\n", - " -0.11240305 -0.11352309 -0.115412 -0.1128402 -0.11338749 -0.1153551\n", - " -0.11250042 -0.11347275 -0.11548445 -0.11271132 -0.11377527 -0.11558066\n", - " -0.11318598 -0.11325792 -0.11499103 -0.11253099 -0.1129829 -0.11530949\n", - " -0.11239074 -0.11329625 -0.11544761 -0.11262484 -0.11323392 -0.1151936\n", - " -0.11253889 -0.11382403 -0.11511129 -0.11250854 -0.11339898 -0.11536332\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan -0.11542253 -0.11498664 -0.11428517\n", - " -0.11503783 -0.11473447 -0.11458687 -0.11483866 -0.1154254 -0.11479037\n", - " -0.11533015 -0.11515195 -0.11460571 -0.11563491 -0.11433835 -0.11437413\n", - " -0.11510849 -0.11472156 -0.11516494 -0.11545009 -0.115001 -0.11479743\n", - " 
-0.11461761 -0.11537461 -0.11497109 -0.1155148 -0.11567353 -0.11431184\n", - " -0.11546067 -0.11462564 -0.11450721 -0.11511 -0.11487988 -0.11466523\n", - " -0.11585756 -0.11462611 -0.11433121 -0.11538152 -0.11463425 -0.11527088\n", - " -0.11509145 -0.11493588 -0.11484324 -0.11528905 -0.11426327 -0.11476508\n", - " -0.11499562 -0.11451299 -0.11466765 -0.11525918 -0.11469718 -0.11476983\n", - " -0.11467865 -0.1145067 -0.11479425 nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " -0.11352917 -0.1145882 -0.11643688 -0.11418115 -0.11442858 -0.11635549\n", - " -0.11408502 -0.11458383 -0.1163013 -0.1135842 -0.11453566 -0.11575264\n", - " -0.11341863 -0.11481638 -0.11635685 -0.1132144 -0.11438018 -0.11666005\n", - " -0.11311482 -0.11500883 -0.11594984 -0.11409228 -0.11464061 -0.1158012\n", - " -0.11389399 -0.11454081 -0.1157428 -0.11333869 -0.11438896 -0.11676006\n", - " -0.11382523 -0.11443669 -0.11606569 -0.11424726 -0.11464652 -0.11608159\n", - " -0.11396605 -0.11473188 -0.1167532 -0.1136805 -0.11455875 -0.11615814\n", - " -0.11372286 -0.11442829 -0.11590895 -0.1136509 -0.11368863 -0.11660073\n", - " -0.1136605 -0.1141187 -0.11613806 -0.11326355 -0.11427399 -0.11676148\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan nan nan nan\n", - " nan nan nan -0.11573534 -0.11897501 -0.1226239\n", - " -0.1162633 -0.11939573 -0.12255715 -0.11636411 -0.11878021 -0.12306277\n", - " -0.11535113 -0.11813967 -0.1230085 -0.11594119 -0.11812955 -0.12217928\n", - " -0.11523023 -0.11843291 -0.12228252 -0.1159457 -0.11840108 -0.12181337\n", - " -0.11600134 -0.11790484 -0.12203724 -0.11579998 -0.11787918 -0.12317219\n", - " -0.11578704 -0.11837798 -0.12379234 -0.1155279 -0.11865384 -0.12319867\n", - " -0.11597008 -0.11886814 -0.12291788 -0.1162282 -0.11918752 -0.12363613\n", - " -0.11571473 -0.11805225 -0.12250506 -0.11640247 
-0.11823175 -0.1226976\n", - " -0.11571549 -0.11813327 -0.12229009 -0.11621545 -0.11793769 -0.1229533\n", - " -0.11528287 -0.1183919 -0.12121653]\n", - " warnings.warn(\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ "Лучшие гиперпараметры для Gradient Boosting:\n", - "{'learning_rate': 0.1, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}\n" + "{'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 2, 'model__min_samples_split': 5, 'model__n_estimators': 100}\n" ] } ], "source": [ - "\n", "from sklearn.ensemble import GradientBoostingRegressor\n", "\n", "param_grid_gb = {\n", - " 'n_estimators': [50, 100, 200],\n", - " 'learning_rate': [0.01, 0.1, 0.2],\n", - " 'max_depth': [3, 5, 7],\n", - " 'min_samples_split': [2, 5, 10],\n", - " 'min_samples_leaf': [1, 2, 4],\n", - " 'max_features': ['auto', 'sqrt', 'log2']\n", + " 'model__n_estimators': [50, 100, 200],\n", + " 'model__learning_rate': [0.01, 0.1, 0.2],\n", + " 'model__max_depth': [3, 5, 7],\n", + " 'model__min_samples_split': [2, 5, 10],\n", + " 'model__min_samples_leaf': [1, 2, 4],\n", + " 'model__max_features': [None, 'sqrt', 'log2']  # 'auto' is rejected by current scikit-learn; None is an accepted value\n", "}\n", "\n", "grid_search_gb = GridSearchCV(\n", @@ -577,10 +463,10 @@ "from sklearn.model_selection import GridSearchCV\n", "\n", "param_grid_knn = {\n", - " 'n_neighbors': [3, 5, 7, 10],\n", - " 'weights': ['uniform', 'distance'],\n", - " 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],\n", - " 'p': [1, 2]\n", + " 'model__n_neighbors': [3, 5, 7, 10],\n", + " 'model__weights': ['uniform', 'distance'],\n", + " 'model__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],\n", + " 'model__p': [1, 2]\n", "}\n", "\n", "grid_search_knn = GridSearchCV(\n", @@ -611,11 +497,9 @@ "metadata": {}, "outputs": [], "source": [ - "y_pred = model.predict(x_test)\n", - "y_pred_forest = model_forest.predict(x_test)\n", - "y_pred_lasso = 
model_lasso.predict(x_test)\n", - "y_pred_gb = model_gb.predict(x_test)\n", - "y_pred_neighbors = model_knn.predict(x_test)" + "y_pred_lasso = grid_search_lasso.predict(x_test)\n", + "y_pred_forest = grid_search_gb.predict(x_test)\n", + "y_pred_neighbors = grid_search_knn.predict(x_test)" ] }, {