diff --git a/lab_4/lab_4.ipynb b/lab_4/lab_4.ipynb new file mode 100644 index 0000000..43d4dd4 --- /dev/null +++ b/lab_4/lab_4.ipynb @@ -0,0 +1,6625 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Вариант 19: Данные о миллионерах" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['Rank ', 'Name', 'Networth', 'Age', 'Country', 'Source', 'Industry'], dtype='object')\n" + ] + } + ], + "source": [ + "import pandas as pd \n", + "df = pd.read_csv(\"..//static//csv//Forbes Billionaires.csv\")\n", + "print(df.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Бизнес-цели\n", + "\n", + "### Задача классификации\n", + "Классифицировать людей по уровню состояния.\n", + "\n", + "Цель:\n", + "\n", + "Разработать модель машинного обучения, которая будет классифицировать миллиардеров по состоянию, выше или ниже среднего.\n", + "\n", + "В обучении модели машинного обучения для классификации миллиардеров по уровню богатства, помимо чистого состояния, используются и другие столбцы данных:\n", + "- Возраст: Люди с высоким чистым состоянием, как правило, старше. Модель может использовать возраст как признак, чтобы прогнозировать уровень богатства.\n", + "- Страна: Богатство распределяется неравномерно по миру. Страна проживания может быть важным признаком для предсказания уровня богатства.\n", + "- Отрасль: Определенные отрасли (например, финансы, технологии) часто связаны с высоким чистым состоянием. \n", + "\n", + "### Задача регрессии:\n", + "Прогнозирование чистого состояния (Networth):\n", + "\n", + "Цель: Предсказать абсолютное значение чистого состояния миллиардера, используя информацию из имеющихся данных.\n", + "\n", + "Применение: Это может быть полезно для оценки потенциального состояния миллиардеров в будущем или для сравнения миллиардеров в разных странах и отраслях.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Определение достижимого уровня качества модели для первой задачи " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Создание целевой переменной и предварительная обработка данных" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Среднее значение поля 'Networth': 4.8607499999999995\n", + " Rank Name Networth Age Country \\\n", + "0 1 Elon Musk 219.0 50 United States \n", + "1 2 Jeff Bezos 171.0 58 United States \n", + "2 3 Bernard Arnault & family 158.0 73 France \n", + "3 4 Bill Gates 129.0 66 United States \n", + "4 5 Warren Buffett 118.0 91 United States \n", + "\n", + " Source Industry above_average_networth \n", + "0 Tesla, SpaceX Automotive 1 \n", + "1 Amazon Technology 1 \n", + "2 LVMH Fashion & Retail 1 \n", + "3 Microsoft Technology 1 \n", + "4 Berkshire Hathaway Finance & Investments 1 \n" + ] + } + ], + "source": [ + "from sklearn import set_config\n", + "\n", + "# Установим параметры для вывода\n", + "set_config(transform_output=\"pandas\")\n", + "\n", + "# Устанавливаем случайное состояние\n", + "random_state = 42\n", + "# Можно использовать данные о above_average_networth для анализа зависимости между типом источника богатства и чистым состоянием.\n", + "# Рассчитываем среднее значение чистого состояния\n", + "average_networth = df['Networth'].mean()\n", + "print(f\"Среднее значение поля 'Networth': {average_networth}\")\n", + "\n", + "# Создаем новую переменную, указывающую, превышает ли чистое состояние среднее\n", + "df['above_average_networth'] = (df['Networth'] > average_networth).astype(int)\n", + "\n", + "# Выводим первые строки измененной таблицы для проверки\n", + "print(df.head())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Разделение набора данных на обучающую и тестовые выборки (80/20) для задачи классификации\n", + "\n", + "Целевой признак -- above_average_networth " + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "X_train shape: (2080, 8)\n", + "y_train shape: (2080, 1)\n", + "X_test shape: (520, 8)\n", + "y_test shape: (520, 1)\n", + "X_train:\n", + " Rank Name Networth Age Country Source \\\n", + "2125 2076 Yogesh Kothari 1.4 73 India specialty chemicals \n", + "1165 1163 Yvonne Bauer 2.7 45 Germany magazines, media \n", + "397 398 Juergen Blickle 6.4 75 Germany auto parts \n", + "1432 1397 Alexander Svetakov 2.2 54 Russia real estate \n", + "1024 1012 Li Min 3.0 56 China semiconductor \n", + "\n", + " Industry above_average_networth \n", + "2125 Manufacturing 0 \n", + "1165 Media & Entertainment 0 \n", + "397 Manufacturing 1 \n", + "1432 Finance & Investments 0 \n", + "1024 Technology 0 \n", + "y_train:\n", + " above_average_networth\n", + "2125 0\n", + "1165 0\n", + "397 1\n", + "1432 0\n", + "1024 0\n", + "X_test:\n", + " Rank Name Networth Age Country \\\n", + "2437 2324 Horst Wortmann 1.2 80 Germany \n", + "2118 2076 Ramesh Juneja 1.4 66 India \n", + "1327 1292 Teresita Sy-Coson 2.4 71 Philippines \n", + "2063 1929 Myron Wentz 1.5 82 St. Kitts and Nevis \n", + "1283 1238 Suh Kyung-bae 2.5 59 South Korea \n", + "\n", + " Source Industry above_average_networth \n", + "2437 footwear Fashion & Retail 0 \n", + "2118 pharmaceuticals Healthcare 0 \n", + "1327 diversified diversified 0 \n", + "2063 health products Fashion & Retail 0 \n", + "1283 cosmetics Fashion & Retail 0 \n", + "y_test:\n", + " above_average_networth\n", + "2437 0\n", + "2118 0\n", + "1327 0\n", + "2063 0\n", + "1283 0\n" + ] + } + ], + "source": [ + "from typing import Tuple\n", + "from pandas import DataFrame\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "def split_stratified_into_train_val_test(\n", + " df_input,\n", + " stratify_colname=\"y\",\n", + " frac_train=0.6,\n", + " frac_val=0.15,\n", + " frac_test=0.25,\n", + " random_state=None,\n", + ") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n", + "\n", + "\n", + " if frac_train + frac_val + frac_test != 1.0:\n", + " raise ValueError(\n", + " \"fractions %f, %f, %f do not add up to 1.0\"\n", + " % (frac_train, frac_val, frac_test)\n", + " )\n", + " if stratify_colname not in df_input.columns:\n", + " raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n", + " X = df_input # Contains all columns.\n", + " y = df_input[\n", + " [stratify_colname]\n", + " ] # Dataframe of just the column on which to stratify.\n", + " # Split original dataframe into train and temp dataframes.\n", + " df_train, df_temp, y_train, y_temp = train_test_split(\n", + " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n", + " )\n", + " if frac_val <= 0:\n", + " assert len(df_input) == len(df_train) + len(df_temp)\n", + " return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n", + " # Split the temp dataframe into val and test dataframes.\n", + " relative_frac_test = frac_test / (frac_val + frac_test)\n", + " df_val, df_test, y_val, y_test = train_test_split(\n", + " df_temp,\n", + " y_temp,\n", + " stratify=y_temp,\n", + " test_size=relative_frac_test,\n", + " random_state=random_state,\n", + " )\n", + " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n", + " return df_train, df_val, df_test, y_train, y_val, y_test\n", + "\n", + "# Разделение набора данных на обучающую, валидационную и тестовую выборки (80/0/20)\n", + "random_state = 42 # Задайте любое целое число для воспроизводимости\n", + "X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n", + " df, stratify_colname=\"above_average_networth\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=random_state\n", + ")\n", + "\n", + "# Вывод размеров выборок\n", + "print(\"X_train shape:\", X_train.shape)\n", + "print(\"y_train shape:\", y_train.shape)\n", + "print(\"X_test shape:\", X_test.shape)\n", + "print(\"y_test shape:\", y_test.shape)\n", + "\n", + "# Отображение содержимого выборок (необязательно, но полезно для проверки)\n", + "print(\"X_train:\\n\", X_train.head())\n", + "print(\"y_train:\\n\", y_train.head())\n", + "print(\"X_test:\\n\", X_test.head())\n", + "print(\"y_test:\\n\", y_test.head())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Формирование конвейера для классификации данных\n", + "\n", + "preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n", + "\n", + "preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование\n", + "\n", + "features_preprocessing -- трансформер для предобработки признаков\n", + "\n", + "features_engineering -- трансформер для конструирования признаков\n", + "\n", + "drop_columns -- трансформер для удаления колонок\n", + "\n", + "pipeline_end -- основной конвейер предобработки данных и конструирования признаков" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Определение столбцов для обработки\n", + "columns_to_drop = [\"Name\", \"Rank \"] # Столбцы, которые можно удалить\n", + "num_columns = [\"Networth\", \"Age\"] # Числовые столбцы\n", + "cat_columns = [\"Country\", \"Source\", \"Industry\"] # Категориальные столбцы\n", + "\n", + "# Препроцессинг числовых столбцов\n", + "num_imputer = SimpleImputer(strategy=\"median\")\n", + "num_scaler = StandardScaler()\n", + "preprocessing_num = Pipeline(\n", + " [\n", + " (\"imputer\", num_imputer),\n", + " (\"scaler\", num_scaler),\n", + " ]\n", + ")\n", + "\n", + "# Препроцессинг категориальных столбцов\n", + "cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n", + "cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n", + "preprocessing_cat = Pipeline(\n", + " [\n", + " (\"imputer\", cat_imputer),\n", + " (\"encoder\", cat_encoder),\n", + " ]\n", + ")\n", + "\n", + "# Объединение препроцессинга\n", + "features_preprocessing = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"preprocessing_num\", preprocessing_num, num_columns),\n", + " (\"preprocessing_cat\", preprocessing_cat, cat_columns),\n", + " ],\n", + " remainder=\"passthrough\"\n", + ")\n", + "\n", + "# Удаление ненужных столбцов\n", + "drop_columns = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"drop_columns\", \"drop\", columns_to_drop),\n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", + "\n", + "# Создание финального пайплайна\n", + "pipeline_end = Pipeline(\n", + " [\n", + " (\"features_preprocessing\", features_preprocessing),\n", + " (\"drop_columns\", drop_columns),\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Демонстрация работы конвейера__" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Networth Age Country_Argentina Country_Australia \\\n", + "2125 -0.340947 0.680013 0.0 0.0 \n", + "1165 -0.211625 -1.475070 0.0 0.0 \n", + "397 0.156447 0.833948 0.0 0.0 \n", + "1432 -0.261364 -0.782365 0.0 0.0 \n", + "1024 -0.181781 -0.628430 0.0 0.0 \n", + "\n", + " Country_Austria Country_Barbados Country_Belgium Country_Belize \\\n", + "2125 0.0 0.0 0.0 0.0 \n", + "1165 0.0 0.0 0.0 0.0 \n", + "397 0.0 0.0 0.0 0.0 \n", + "1432 0.0 0.0 0.0 0.0 \n", + "1024 0.0 0.0 0.0 0.0 \n", + "\n", + " Country_Brazil Country_Bulgaria ... Industry_Manufacturing \\\n", + "2125 0.0 0.0 ... 1.0 \n", + "1165 0.0 0.0 ... 0.0 \n", + "397 0.0 0.0 ... 1.0 \n", + "1432 0.0 0.0 ... 0.0 \n", + "1024 0.0 0.0 ... 0.0 \n", + "\n", + " Industry_Media & Entertainment Industry_Metals & Mining \\\n", + "2125 0.0 0.0 \n", + "1165 1.0 0.0 \n", + "397 0.0 0.0 \n", + "1432 0.0 0.0 \n", + "1024 0.0 0.0 \n", + "\n", + " Industry_Real Estate Industry_Service Industry_Sports \\\n", + "2125 0.0 0.0 0.0 \n", + "1165 0.0 0.0 0.0 \n", + "397 0.0 0.0 0.0 \n", + "1432 0.0 0.0 0.0 \n", + "1024 0.0 0.0 0.0 \n", + "\n", + " Industry_Technology Industry_Telecom Industry_diversified \\\n", + "2125 0.0 0.0 0.0 \n", + "1165 0.0 0.0 0.0 \n", + "397 0.0 0.0 0.0 \n", + "1432 0.0 0.0 0.0 \n", + "1024 1.0 0.0 0.0 \n", + "\n", + " above_average_networth \n", + "2125 0 \n", + "1165 0 \n", + "397 1 \n", + "1432 0 \n", + "1024 0 \n", + "\n", + "[5 rows x 859 columns]\n" + ] + } + ], + "source": [ + "preprocessing_result = pipeline_end.fit_transform(X_train)\n", + "preprocessed_df = pd.DataFrame(\n", + " preprocessing_result,\n", + " columns=pipeline_end.get_feature_names_out(),\n", + ")\n", + "\n", + "# Вывод первых строк обработанных данных\n", + "print(preprocessed_df.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Формирование набора моделей для классификации\n", + "\n", + "logistic -- логистическая регрессия\n", + "\n", + "ridge -- гребневая регрессия\n", + "\n", + "decision_tree -- дерево решений\n", + "\n", + "knn -- k-ближайших соседей\n", + "\n", + "naive_bayes -- наивный Байесовский классификатор\n", + "\n", + "gradient_boosting -- метод градиентного бустинга (набор деревьев решений)\n", + "\n", + "random_forest -- метод случайного леса (набор деревьев решений)\n", + "\n", + "mlp -- многослойный персептрон (нейронная сеть)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import linear_model, tree, neighbors, naive_bayes, ensemble, neural_network\n", + "class_models = {\n", + " \"logistic\": {\"model\": linear_model.LogisticRegression()},\n", + " \"ridge\": {\"model\": linear_model.LogisticRegression(penalty=\"l2\", class_weight=\"balanced\")},\n", + " \"decision_tree\": {\n", + " \"model\": tree.DecisionTreeClassifier(max_depth=7, random_state=42)\n", + " },\n", + " \"knn\": {\"model\": neighbors.KNeighborsClassifier(n_neighbors=7)},\n", + " \"naive_bayes\": {\"model\": naive_bayes.GaussianNB()},\n", + " \"gradient_boosting\": {\n", + " \"model\": ensemble.GradientBoostingClassifier(n_estimators=210)\n", + " },\n", + " \"random_forest\": {\n", + " \"model\": ensemble.RandomForestClassifier(\n", + " max_depth=11, class_weight=\"balanced\", random_state=42\n", + " )\n", + " },\n", + " \"mlp\": {\n", + " \"model\": neural_network.MLPClassifier(\n", + " hidden_layer_sizes=(7,),\n", + " max_iter=500,\n", + " early_stopping=True,\n", + " random_state=42,\n", + " )\n", + " },\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Обучение моделей и оценка их качества" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: logistic\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: ridge\n", + "Model: decision_tree\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: knn\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: naive_bayes\n", + "Model: gradient_boosting\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: random_forest\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: mlp\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn import metrics\n", + "\n", + "for model_name in class_models.keys():\n", + " print(f\"Model: {model_name}\")\n", + " model = class_models[model_name][\"model\"]\n", + "\n", + " model_pipeline = Pipeline([(\"pipeline\", pipeline_end), (\"model\", model)])\n", + " model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())\n", + "\n", + " y_train_predict = model_pipeline.predict(X_train)\n", + " y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]\n", + " y_test_predict = np.where(y_test_probs > 0.5, 1, 0)\n", + "\n", + " class_models[model_name][\"pipeline\"] = model_pipeline\n", + " class_models[model_name][\"probs\"] = y_test_probs\n", + " class_models[model_name][\"preds\"] = y_test_predict\n", + "\n", + " # Оценка метрик\n", + " class_models[model_name][\"Precision_train\"] = metrics.precision_score(\n", + " y_train, y_train_predict\n", + " )\n", + " class_models[model_name][\"Precision_test\"] = metrics.precision_score(\n", + " y_test, y_test_predict\n", + " )\n", + " class_models[model_name][\"Recall_train\"] = metrics.recall_score(\n", + " y_train, y_train_predict\n", + " )\n", + " class_models[model_name][\"Recall_test\"] = metrics.recall_score(\n", + " y_test, y_test_predict\n", + " )\n", + " class_models[model_name][\"Accuracy_train\"] = metrics.accuracy_score(\n", + " y_train, y_train_predict\n", + " )\n", + " class_models[model_name][\"Accuracy_test\"] = metrics.accuracy_score(\n", + " y_test, y_test_predict\n", + " )\n", + " class_models[model_name][\"ROC_AUC_test\"] = metrics.roc_auc_score(\n", + " y_test, y_test_probs\n", + " )\n", + " class_models[model_name][\"F1_train\"] = metrics.f1_score(y_train, y_train_predict)\n", + " class_models[model_name][\"F1_test\"] = metrics.f1_score(y_test, y_test_predict)\n", + " class_models[model_name][\"MCC_test\"] = metrics.matthews_corrcoef(\n", + " y_test, y_test_predict\n", + " )\n", + " class_models[model_name][\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(\n", + " y_test, y_test_predict\n", + " )\n", + " class_models[model_name][\"Confusion_matrix\"] = metrics.confusion_matrix(\n", + " y_test, y_test_predict\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Сводная таблица оценок качества для использованных моделей классификации" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.metrics import ConfusionMatrixDisplay\n", + "import matplotlib.pyplot as plt\n", + "\n", + "_, ax = plt.subplots(int(len(class_models) / 2), 2, figsize=(12, 10), sharex=False, sharey=False)\n", + "\n", + "for index, key in enumerate(class_models.keys()):\n", + " c_matrix = class_models[key][\"Confusion_matrix\"]\n", + " disp = ConfusionMatrixDisplay(\n", + " confusion_matrix=c_matrix, display_labels=[\"Below Average\", \"Above Average\"] # Измените метки на нужные\n", + " ).plot(ax=ax.flat[index])\n", + " disp.ax_.set_title(key)\n", + "\n", + "plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "403 - это количество правильно предсказанных объектов с чистым состоянием выше среднего.\n", + "117 - это количество объектов с чистым состоянием выше среднего, которые модель ошибочно отнесла к категории ниже среднего.\n", + "1. Высокая точность: Модель демонстрирует высокую точность в определении объектов с чистым состоянием выше среднего. Это означает, что она хорошо справляется с задачей выделения богатых людей.\n", + "2. Проблема с ложными отрицательными: Высокое количество ложных отрицательных результатов (117) говорит о том, что ваша модель пропускает значительное количество богатых людей. Она не всегда распознает их как \"выше среднего\".\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Точность, полнота, верность (аккуратность), F-мера" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 Precision_trainPrecision_testRecall_trainRecall_testAccuracy_trainAccuracy_testF1_trainF1_test
logistic1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
ridge1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
decision_tree1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
gradient_boosting1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
random_forest1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
mlp1.0000001.0000000.9957260.9829060.9990380.9961540.9978590.991379
naive_bayes1.0000000.9206351.0000000.9914531.0000000.9788461.0000000.954733
knn1.0000001.0000000.8482910.8119660.9658650.9576920.9179190.896226
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n", + " [\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " \"Accuracy_train\",\n", + " \"Accuracy_test\",\n", + " \"F1_train\",\n", + " \"F1_test\",\n", + " ]\n", + "]\n", + "class_metrics.sort_values(\n", + " by=\"Accuracy_test\", ascending=False\n", + ").style.background_gradient(\n", + " cmap=\"plasma\",\n", + " low=0.3,\n", + " high=1,\n", + " subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n", + ").background_gradient(\n", + " cmap=\"viridis\",\n", + " low=1,\n", + " high=0.3,\n", + " subset=[\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Все модели в данной выборке — логистическая регрессия, ридж-регрессия, дерево решений, KNN, наивный байесовский классификатор, градиентный бустинг, случайный лес и многослойный перцептрон (MLP) — демонстрируют идеальные значения по всем метрикам на обучающих и тестовых наборах данных. Это достигается, поскольку все модели показали значения, равные 1.0 для Precision, Recall, Accuracy и F1-меры, что указывает на то, что модель безошибочно классифицирует все примеры.\n", + "\n", + "Модель MLP, хотя и имеет немного более низкие значения Recall (0.994) и F1-на тестовом наборе (0.997) по сравнению с другими, по-прежнему остается высокоэффективной. Тем не менее, она не снижает показатели классификации до такого уровня, что может вызвать обеспокоенность, и остается на уровне, близком к идеальному." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "ROC-кривая, каппа Коэна, коэффициент корреляции Мэтьюса" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 Accuracy_testF1_testROC_AUC_testCohen_kappa_testMCC_test
logistic1.0000001.0000001.0000001.0000001.000000
ridge1.0000001.0000001.0000001.0000001.000000
decision_tree1.0000001.0000001.0000001.0000001.000000
gradient_boosting1.0000001.0000001.0000001.0000001.000000
mlp0.9961540.9913791.0000000.9889040.988965
random_forest1.0000001.0000001.0000001.0000001.000000
knn0.9576920.8962260.9978580.8700150.877459
naive_bayes0.9788460.9547330.9833200.9409550.942055
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n", + " [\n", + " \"Accuracy_test\",\n", + " \"F1_test\",\n", + " \"ROC_AUC_test\",\n", + " \"Cohen_kappa_test\",\n", + " \"MCC_test\",\n", + " ]\n", + "]\n", + "\n", + "# Сортировка по ROC_AUC_test в порядке убывания\n", + "class_metrics = class_metrics.sort_values(by=\"ROC_AUC_test\", ascending=False)\n", + "\n", + "# Применение стилей\n", + "class_metrics.style.background_gradient(\n", + " cmap=\"plasma\", # Цветовая палитра для ROC_AUC_test, MCC_test, Cohen_kappa_test\n", + " low=0.3, # Минимальное значение для цветового градиента\n", + " high=1, # Максимальное значение для цветового градиента\n", + " subset=[\n", + " \"ROC_AUC_test\",\n", + " \"MCC_test\",\n", + " \"Cohen_kappa_test\",\n", + " ],\n", + ").background_gradient(\n", + " cmap=\"viridis\", # Цветовая палитра для Accuracy_test, F1_test\n", + " low=1, # Минимальное значение для цветового градиента\n", + " high=0.3, # Максимальное значение для цветового градиента\n", + " subset=[\n", + " \"Accuracy_test\",\n", + " \"F1_test\",\n", + " ],\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Почти все модели, включая логистическую регрессию, ридж-регрессию, дерево решений, градиентный бустинг и случайный лес, показали выдающиеся результаты по всем метрикам:\n", + "\n", + "- **Accuracy**: Все модели достигли идеальной точности (1.000), что означает, что они правильно классифицировали все объекты в тестовом наборе.\n", + "- **F1**: Аналогично, все модели показали идеальное значение F1-меры (1.000), что говорит о балансе между точностью (precision) и полнотой (recall).\n", + "- **ROC AUC**: Все модели достигли максимального значения ROC AUC (1.000), что указывает на их способность различать классы с идеальной точностью.\n", + "- **Cohen's Kappa**: Идеальное значение Cohen's Kappa (1.000) подтверждает высокую согласованность классификации с идеальным классификатором.\n", + "- **MCC**: Идеальное значение MCC (1.000) указывает на высокую точность классификации и сильную связь между предсказаниями и истинными значениями.\n", + "\n", + "Модель MLP (Многослойный перцептрон) также показала отличные результаты:\n", + "\n", + "- **Accuracy**: Достигла значения 0.996, что немного ниже идеального, но все еще очень высокий результат.\n", + "- **F1**: Значение F1-меры равно 0.991, что также указывает на высокую эффективность модели.\n", + "- **ROC AUC**: MLP достигает идеального значения ROC AUC (1.000), что свидетельствует о ее способности выделять классы с идеальной точностью.\n", + "- **Cohen's Kappa**: Высокое значение Cohen's Kappa (0.989) говорит о хорошей согласованности классификации с идеальным классификатором.\n", + "- **MCC**: Высокое значение MCC (0.989) также подтверждает высокую точность классификации и сильную связь между предсказаниями и истинными значениями.\n", + "\n", + "Модель KNN показала сравнительно более низкие результаты:\n", + "\n", + "- **Accuracy**: Достигла значения 0.958, что ниже идеального, но все еще является приемлемым результатом.\n", + "- **F1**: Значение F1-меры равно 0.896, что указывает на более низкую эффективность модели по сравнению с другими.\n", + "- **ROC AUC**: KNN достигает значения ROC AUC 0.998, что свидетельствует о ее способности выделять классы с хорошей точностью.\n", + "- **Cohen's Kappa**: Значение Cohen's Kappa (0.870) говорит о более низкой согласованности классификации с идеальным классификатором.\n", + "- **MCC**: Значение MCC (0.877) также подтверждает более низкую точность классификации и связи между предсказаниями и истинными значениями.\n", + "\n", + "Модель наивного байесовского классификатора (naive_bayes) показала следующие результаты:\n", + "- **Accuracy**: Модель правильно классифицировала 97.88% объектов в тестовом наборе. Это довольно хороший результат, но не идеальный.\n", + "- **F1-мера**: Значение F1-меры 0.955 указывает на то, что модель достигает баланса между точностью (precision) и полнотой (recall). Это означает, что модель хорошо справляется как с правильным определением объектов, относящихся к классу \"выше среднего\" чистого состояния, так и с минимизацией пропускания таких объектов.\n", + "- **ROC AUC**: Модель достигла значения ROC AUC 0.983, что свидетельствует о ее способности различать классы с высокой точностью. \n", + "- **Cohen's Kappa**: Значение 0.941 говорит о том, что модель демонстрирует высокую степень согласованности с идеальным классификатором, но не идеальную. \n", + "- **MCC**: MCC 0.942 также подтверждает высокую точность классификации модели и сильную связь между предсказаниями и истинными значениями, но не идеальную." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'logistic'" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "best_model = str(class_metrics.sort_values(by=\"MCC_test\", ascending=False).iloc[0].name)\n", + "\n", + "display(best_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Вывод данных с ошибкой предсказания для оценки" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": [ + "'Error items count: 0'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RankPredictedNameNetworthAgeCountrySourceIndustryabove_average_networth
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [Rank , Predicted, Name, Networth, Age, Country, Source, Industry, above_average_networth]\n", + "Index: []" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Преобразование тестовых данных\n", + "preprocessing_result = pipeline_end.transform(X_test)\n", + "preprocessed_df = pd.DataFrame(\n", + " preprocessing_result,\n", + " columns=pipeline_end.get_feature_names_out(),\n", + ")\n", + "\n", + "# Получение предсказаний лучшей модели\n", + "y_pred = class_models[best_model][\"preds\"]\n", + "\n", + "# Нахождение индексов ошибок\n", + "error_index = y_test[y_test[\"above_average_networth\"] != y_pred].index.tolist() # Изменено на \"above_average_networth\"\n", + "display(f\"Error items count: {len(error_index)}\")\n", + "\n", + "# Создание DataFrame с ошибочными объектами\n", + "error_predicted = pd.Series(y_pred, index=y_test.index).loc[error_index]\n", + "error_df = X_test.loc[error_index].copy()\n", + "error_df.insert(loc=1, column=\"Predicted\", value=error_predicted)\n", + "error_df = error_df.sort_index() # Сортировка по индексу\n", + "\n", + "# Вывод DataFrame с ошибочными объектами\n", + "display(error_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Пример использования обученной модели (конвейера) для предсказания" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RankNameNetworthAgeCountrySourceIndustryabove_average_networth
14651445Gordon Getty2.188United StatesGetty OilEnergy0
\n", + "
" + ], + "text/plain": [ + " Rank Name Networth Age Country Source Industry \\\n", + "1465 1445 Gordon Getty 2.1 88 United States Getty Oil Energy \n", + "\n", + " above_average_networth \n", + "1465 0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NetworthAgeCountry_ArgentinaCountry_AustraliaCountry_AustriaCountry_BarbadosCountry_BelgiumCountry_BelizeCountry_BrazilCountry_Bulgaria...Industry_ManufacturingIndustry_Media & EntertainmentIndustry_Metals & MiningIndustry_Real EstateIndustry_ServiceIndustry_SportsIndustry_TechnologyIndustry_TelecomIndustry_diversifiedabove_average_networth
1465-0.2713121.8345220.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n", + "

1 rows × 859 columns

\n", + "
" + ], + "text/plain": [ + " Networth Age Country_Argentina Country_Australia \\\n", + "1465 -0.271312 1.834522 0.0 0.0 \n", + "\n", + " Country_Austria Country_Barbados Country_Belgium Country_Belize \\\n", + "1465 0.0 0.0 0.0 0.0 \n", + "\n", + " Country_Brazil Country_Bulgaria ... Industry_Manufacturing \\\n", + "1465 0.0 0.0 ... 0.0 \n", + "\n", + " Industry_Media & Entertainment Industry_Metals & Mining \\\n", + "1465 0.0 0.0 \n", + "\n", + " Industry_Real Estate Industry_Service Industry_Sports \\\n", + "1465 0.0 0.0 0.0 \n", + "\n", + " Industry_Technology Industry_Telecom Industry_diversified \\\n", + "1465 0.0 0.0 0.0 \n", + "\n", + " above_average_networth \n", + "1465 0.0 \n", + "\n", + "[1 rows x 859 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "predicted: 0 (proba: [0.99415059 0.00584941])\n", + "real: 0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "# Выбираем лучшую модель\n", + "model = class_models[best_model][\"pipeline\"]\n", + "\n", + "# Выбираем позиционный индекс объекта для анализа\n", + "example_index = 13\n", + "\n", + "# Получаем исходные данные для объекта\n", + "test = pd.DataFrame(X_test.iloc[example_index, :]).T\n", + "display(test)\n", + "\n", + "# Получаем преобразованные данные для объекта\n", + "test_preprocessed = pd.DataFrame(preprocessed_df.iloc[example_index, :]).T\n", + "display(test_preprocessed)\n", + "\n", + "# Делаем предсказание\n", + "result_proba = model.predict_proba(test)[0]\n", + "result = model.predict(test)[0]\n", + "\n", + "# Получаем реальное значение\n", + "real = int(y_test.iloc[example_index].values[0])\n", + "\n", + "# Выводим результаты\n", + "print(f\"predicted: {result} (proba: {result_proba})\")\n", + "print(f\"real: {real}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Подбор гиперпараметров методом поиска по сетке" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\numpy\\ma\\core.py:2881: RuntimeWarning: invalid value encountered in cast\n", + " _data = np.array(data, dtype=dtype, copy=copy,\n" + ] + }, + { + "data": { + "text/plain": [ + "{'model__criterion': 'gini',\n", + " 'model__max_depth': 5,\n", + " 'model__max_features': 'sqrt',\n", + " 'model__n_estimators': 50}" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "optimized_model_type = \"random_forest\"\n", + "\n", + "random_forest_model = class_models[optimized_model_type][\"pipeline\"]\n", + "\n", + "param_grid = {\n", + " \"model__n_estimators\": [10, 50, 100],\n", + " \"model__max_features\": [\"sqrt\", \"log2\"],\n", + " \"model__max_depth\": [5, 7, 10],\n", + " \"model__criterion\": [\"gini\", \"entropy\"],\n", + "}\n", + "\n", + "gs_optomizer = GridSearchCV(\n", + " estimator=random_forest_model, param_grid=param_grid, n_jobs=-1\n", + ")\n", + "gs_optomizer.fit(X_train, y_train.values.ravel())\n", + "gs_optomizer.best_params_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Обучение модели с новыми гиперпараметрами__" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "import numpy as np\n", + "from sklearn import metrics\n", + "import pandas as pd\n", + "\n", + "\n", + "# Определяем числовые признаки\n", + "numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()\n", + "\n", + "# Установка random_state\n", + "random_state = 42\n", + "\n", + "# Определение трансформера\n", + "pipeline_end = ColumnTransformer([\n", + " ('numeric', StandardScaler(), numeric_features),\n", + " # Добавьте другие трансформеры, если требуется\n", + "])\n", + "\n", + "# Объявление модели\n", + "optimized_model = RandomForestClassifier(\n", + " random_state=random_state,\n", + " criterion=\"gini\",\n", + " max_depth=5,\n", + " max_features=\"sqrt\",\n", + " n_estimators=10,\n", + ")\n", + "\n", + "# Создание пайплайна с корректными шагами\n", + "result = {}\n", + "\n", + "# Обучение модели\n", + "result[\"pipeline\"] = Pipeline([\n", + " (\"pipeline\", pipeline_end),\n", + " (\"model\", optimized_model)\n", + "]).fit(X_train, y_train.values.ravel())\n", + "\n", + "# Прогнозирование и расчет метрик\n", + "result[\"train_preds\"] = result[\"pipeline\"].predict(X_train)\n", + "result[\"probs\"] = result[\"pipeline\"].predict_proba(X_test)[:, 1]\n", + "result[\"preds\"] = np.where(result[\"probs\"] > 0.5, 1, 0)\n", + "\n", + "# Метрики для оценки модели\n", + "result[\"Precision_train\"] = metrics.precision_score(y_train, result[\"train_preds\"])\n", + "result[\"Precision_test\"] = metrics.precision_score(y_test, result[\"preds\"])\n", + "result[\"Recall_train\"] = metrics.recall_score(y_train, result[\"train_preds\"])\n", + "result[\"Recall_test\"] = metrics.recall_score(y_test, result[\"preds\"])\n", + "result[\"Accuracy_train\"] = metrics.accuracy_score(y_train, result[\"train_preds\"])\n", + "result[\"Accuracy_test\"] = metrics.accuracy_score(y_test, result[\"preds\"])\n", + "result[\"ROC_AUC_test\"] = metrics.roc_auc_score(y_test, result[\"probs\"])\n", + "result[\"F1_train\"] = metrics.f1_score(y_train, result[\"train_preds\"])\n", + "result[\"F1_test\"] = metrics.f1_score(y_test, result[\"preds\"])\n", + "result[\"MCC_test\"] = metrics.matthews_corrcoef(y_test, result[\"preds\"])\n", + "result[\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(y_test, result[\"preds\"])\n", + "result[\"Confusion_matrix\"] = metrics.confusion_matrix(y_test, result[\"preds\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Формирование данных для оценки старой и новой версии модели" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "optimized_metrics = pd.DataFrame(columns=list(result.keys()))\n", + "optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n", + " data=class_models[optimized_model_type]\n", + ")\n", + "optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n", + " data=result\n", + ")\n", + "optimized_metrics.insert(loc=0, column=\"Name\", value=[\"Old\", \"New\"])\n", + "optimized_metrics = optimized_metrics.set_index(\"Name\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Оценка параметров старой и новой модели" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 Precision_trainPrecision_testRecall_trainRecall_testAccuracy_trainAccuracy_testF1_trainF1_test
Name        
Old1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
New1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "optimized_metrics[\n", + " [\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " \"Accuracy_train\",\n", + " \"Accuracy_test\",\n", + " \"F1_train\",\n", + " \"F1_test\",\n", + " ]\n", + "].style.background_gradient(\n", + " cmap=\"plasma\",\n", + " low=0.3,\n", + " high=1,\n", + " subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n", + ").background_gradient(\n", + " cmap=\"viridis\",\n", + " low=1,\n", + " high=0.3,\n", + " subset=[\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Обе модели, как \"Old\", так и \"New\", демонстрируют идеальную производительность по всем ключевым метрикам: Precision, Recall, Accuracy и F1 как на обучающей (train), так и на тестовой (test) выборках. Все значения равны 1.000000, что указывает на отсутствие ошибок в классификации и максимальную точность." + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 Accuracy_testF1_testROC_AUC_testCohen_kappa_testMCC_test
Name     
Old1.0000001.0000001.0000001.0000001.000000
New1.0000001.0000001.0000001.0000001.000000
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "optimized_metrics[\n", + " [\n", + " \"Accuracy_test\",\n", + " \"F1_test\",\n", + " \"ROC_AUC_test\",\n", + " \"Cohen_kappa_test\",\n", + " \"MCC_test\",\n", + " ]\n", + "].style.background_gradient(\n", + " cmap=\"plasma\",\n", + " low=0.3,\n", + " high=1,\n", + " subset=[\n", + " \"ROC_AUC_test\",\n", + " \"MCC_test\",\n", + " \"Cohen_kappa_test\",\n", + " ],\n", + ").background_gradient(\n", + " cmap=\"viridis\",\n", + " low=1,\n", + " high=0.3,\n", + " subset=[\n", + " \"Accuracy_test\",\n", + " \"F1_test\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Обе модели, как \"Old\", так и \"New\", показали идеальные результаты по всем выбранным метрикам: Accuracy, F1, ROC AUC, Cohen's kappa и MCC. Все метрики имеют значение 1.000000 как на тестовой выборке, что указывает на безошибочную классификацию и максимальную эффективность обеих моделей." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "_, ax = plt.subplots(1, 2, figsize=(10, 4), sharex=False, sharey=False)\n", + "\n", + "# Предполагается, что optimized_metrics - DataFrame с матрицами ошибок\n", + "for index in range(0, len(optimized_metrics)):\n", + " c_matrix = optimized_metrics.iloc[index][\"Confusion_matrix\"]\n", + " disp = ConfusionMatrixDisplay(\n", + " confusion_matrix=c_matrix, display_labels=[\"Below Average\", \"Above Average\"] # Измените метки на нужные\n", + " ).plot(ax=ax.flat[index])\n", + " disp.ax_.set_title(optimized_metrics.index[index]) # Заголовок с названием модели\n", + "\n", + "plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.3)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В желтом квадрате мы видим значение 403, что обозначает количество правильно классифицированных объектов, отнесенных к классу \"Below Average\". Это свидетельствует о том, что модель успешно идентифицирует объекты этого класса, минимизируя количество ложных положительных срабатываний.\n", + "\n", + "В зеленом квадрате значение 117 указывает на количество правильно классифицированных объектов, отнесенных к классу \"Above Average\". Это также является показателем высокой точности модели в определении объектов данного класса." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Определение достижимого уровня качества модели для второй задачи " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Подготовка данных__" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Загрузка данных и создание целевой переменной" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Среднее значение поля 'Networth': 4.8607499999999995\n", + " Rank Name Networth Age Country \\\n", + "0 1 Elon Musk 219.0 50 United States \n", + "1 2 Jeff Bezos 171.0 58 United States \n", + "2 3 Bernard Arnault & family 158.0 73 France \n", + "3 4 Bill Gates 129.0 66 United States \n", + "4 5 Warren Buffett 118.0 91 United States \n", + "\n", + " Source Industry above_average_networth \n", + "0 Tesla, SpaceX Automotive 1 \n", + "1 Amazon Technology 1 \n", + "2 LVMH Fashion & Retail 1 \n", + "3 Microsoft Technology 1 \n", + "4 Berkshire Hathaway Finance & Investments 1 \n", + "Статистическое описание DataFrame:\n", + " Rank Networth Age above_average_networth\n", + "count 2600.000000 2600.000000 2600.000000 2600.000000\n", + "mean 1269.570769 4.860750 64.271923 0.225000\n", + "std 728.146364 10.659671 13.220607 0.417663\n", + "min 1.000000 1.000000 19.000000 0.000000\n", + "25% 637.000000 1.500000 55.000000 0.000000\n", + "50% 1292.000000 2.400000 64.000000 0.000000\n", + "75% 1929.000000 4.500000 74.000000 0.000000\n", + "max 2578.000000 219.000000 100.000000 1.000000\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn import set_config\n", + "\n", + "set_config(transform_output=\"pandas\")\n", + "\n", + "df = pd.read_csv(\"..//static//csv//Forbes Billionaires.csv\")\n", + "\n", + "random_state = 42\n", + "\n", + "# Вычисление среднего значения поля \"Networth\"\n", + "average_networth = df['Networth'].mean()\n", + "print(f\"Среднее значение поля 'Networth': {average_networth}\")\n", + "\n", + "# Создание новой колонки, указывающей, выше или ниже среднего значение чистого состояния\n", + "df['above_average_networth'] = (df['Networth'] > average_networth).astype(int)\n", + "\n", + "# Вывод DataFrame с новой колонкой\n", + "print(df.head())\n", + "\n", + "# Примерный анализ данных\n", + "print(\"Статистическое описание DataFrame:\")\n", + "print(df.describe())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Разделение набора данных на обучающую и тестовые выборки (80/20) для задачи регрессии\n", + "\n", + "Целевой признак -- above_average_close" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'X_train'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RankNameNetworthAgeCountrySourceIndustry
582579Alexandra Schoerghuber & family4.963Germanyreal estateReal Estate
4849He Xiangjian28.379Chinahome appliancesManufacturing
17721729Bruce Mathieson1.778AustraliahotelsFood & Beverage
964951Pansy Ho3.259Hong KongcasinosGambling & Casinos
22132190Sasson Dayan & family1.382BrazilbankingFinance & Investments
........................
16381579Wang Chou-hsiong1.981TaiwanfootwearManufacturing
10951096Jose Joao Abdalla Filho2.876BrazilinvestmentsFinance & Investments
11301096Lin Chen-hai2.875Taiwanreal estateReal Estate
12941292Banwari Lal Bawri2.469IndiapharmaceuticalsHealthcare
860851Kuok Khoon Hong3.572Singaporepalm oilManufacturing
\n", + "

2080 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " Rank Name Networth Age Country \\\n", + "582 579 Alexandra Schoerghuber & family 4.9 63 Germany \n", + "48 49 He Xiangjian 28.3 79 China \n", + "1772 1729 Bruce Mathieson 1.7 78 Australia \n", + "964 951 Pansy Ho 3.2 59 Hong Kong \n", + "2213 2190 Sasson Dayan & family 1.3 82 Brazil \n", + "... ... ... ... ... ... \n", + "1638 1579 Wang Chou-hsiong 1.9 81 Taiwan \n", + "1095 1096 Jose Joao Abdalla Filho 2.8 76 Brazil \n", + "1130 1096 Lin Chen-hai 2.8 75 Taiwan \n", + "1294 1292 Banwari Lal Bawri 2.4 69 India \n", + "860 851 Kuok Khoon Hong 3.5 72 Singapore \n", + "\n", + " Source Industry \n", + "582 real estate Real Estate \n", + "48 home appliances Manufacturing \n", + "1772 hotels Food & Beverage \n", + "964 casinos Gambling & Casinos \n", + "2213 banking Finance & Investments \n", + "... ... ... \n", + "1638 footwear Manufacturing \n", + "1095 investments Finance & Investments \n", + "1130 real estate Real Estate \n", + "1294 pharmaceuticals Healthcare \n", + "860 palm oil Manufacturing \n", + "\n", + "[2080 rows x 7 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'y_train'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
above_average_networth
5821
481
17720
9640
22130
......
16380
10950
11300
12940
8600
\n", + "

2080 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " above_average_networth\n", + "582 1\n", + "48 1\n", + "1772 0\n", + "964 0\n", + "2213 0\n", + "... ...\n", + "1638 0\n", + "1095 0\n", + "1130 0\n", + "1294 0\n", + "860 0\n", + "\n", + "[2080 rows x 1 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'X_test'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RankNameNetworthAgeCountrySourceIndustry
15931579Guangming Fu & family1.968ChinapoultryFood & Beverage
196197Leon Black10.070United Statesprivate equityFinance & Investments
239235Zong Qinghou8.876ChinabeveragesFood & Beverage
21262076Kurt Krieger1.474Germanyfurniture retailingFashion & Retail
15871579Chen Kaichen1.964Chinahousehold chemicalsManufacturing
........................
17781729Jorge Perez1.772United Statesreal estateReal Estate
166167Brian Chesky11.540United StatesAirbnbTechnology
949913Zhong Ruonong & family3.359ChinaelectronicsManufacturing
4950Miriam Adelson27.576United StatescasinosGambling & Casinos
25112448Lou Boliang1.158United StatespharmaceuticalsHealthcare
\n", + "

520 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " Rank Name Networth Age Country \\\n", + "1593 1579 Guangming Fu & family 1.9 68 China \n", + "196 197 Leon Black 10.0 70 United States \n", + "239 235 Zong Qinghou 8.8 76 China \n", + "2126 2076 Kurt Krieger 1.4 74 Germany \n", + "1587 1579 Chen Kaichen 1.9 64 China \n", + "... ... ... ... ... ... \n", + "1778 1729 Jorge Perez 1.7 72 United States \n", + "166 167 Brian Chesky 11.5 40 United States \n", + "949 913 Zhong Ruonong & family 3.3 59 China \n", + "49 50 Miriam Adelson 27.5 76 United States \n", + "2511 2448 Lou Boliang 1.1 58 United States \n", + "\n", + " Source Industry \n", + "1593 poultry Food & Beverage \n", + "196 private equity Finance & Investments \n", + "239 beverages Food & Beverage \n", + "2126 furniture retailing Fashion & Retail \n", + "1587 household chemicals Manufacturing \n", + "... ... ... \n", + "1778 real estate Real Estate \n", + "166 Airbnb Technology \n", + "949 electronics Manufacturing \n", + "49 casinos Gambling & Casinos \n", + "2511 pharmaceuticals Healthcare \n", + "\n", + "[520 rows x 7 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'y_test'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
above_average_networth
15930
1961
2391
21260
15870
......
17780
1661
9490
491
25110
\n", + "

520 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " above_average_networth\n", + "1593 0\n", + "196 1\n", + "239 1\n", + "2126 0\n", + "1587 0\n", + "... ...\n", + "1778 0\n", + "166 1\n", + "949 0\n", + "49 1\n", + "2511 0\n", + "\n", + "[520 rows x 1 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from typing import Tuple\n", + "import pandas as pd\n", + "from pandas import DataFrame\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "def split_into_train_test(\n", + " df_input: DataFrame,\n", + " target_colname: str = \"above_average_networth\", \n", + " frac_train: float = 0.8,\n", + " random_state: int = None,\n", + ") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame]:\n", + " \n", + " if not (0 < frac_train < 1):\n", + " raise ValueError(\"Fraction must be between 0 and 1.\")\n", + " \n", + " # Проверка наличия целевого признака\n", + " if target_colname not in df_input.columns:\n", + " raise ValueError(f\"{target_colname} is not a column in the DataFrame.\")\n", + " \n", + " # Разделяем данные на признаки и целевую переменную\n", + " X = df_input.drop(columns=[target_colname]) # Признаки\n", + " y = df_input[[target_colname]] # Целевая переменная\n", + "\n", + " # Разделяем данные на обучающую и тестовую выборки\n", + " X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y,\n", + " test_size=(1.0 - frac_train),\n", + " random_state=random_state\n", + " )\n", + " \n", + " return X_train, X_test, y_train, y_test\n", + "\n", + "# Применение функции для разделения данных\n", + "X_train, X_test, y_train, y_test = split_into_train_test(\n", + " df, \n", + " target_colname=\"above_average_networth\", \n", + " frac_train=0.8, \n", + " random_state=42 \n", + ")\n", + "\n", + "# Для отображения результатов\n", + "display(\"X_train\", X_train)\n", + "display(\"y_train\", y_train)\n", + "\n", + "display(\"X_test\", X_test)\n", + "display(\"y_test\", y_test)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Формирование конвейера для классификации данных\n", + "\n", + "preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n", + "\n", + "preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование\n", + "\n", + "features_preprocessing -- трансформер для предобработки признаков\n", + "\n", + "features_engineering -- трансформер для конструирования признаков\n", + "\n", + "drop_columns -- трансформер для удаления колонок\n", + "\n", + "pipeline_end -- основной конвейер предобработки данных и конструирования признаков" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Networth Age Country_Argentina Country_Australia \\\n", + "0 20.092595 -1.079729 0.0 0.0 \n", + "1 15.588775 -0.474496 0.0 0.0 \n", + "2 14.368991 0.660314 0.0 0.0 \n", + "3 11.647933 0.130736 0.0 0.0 \n", + "4 10.615808 2.022087 0.0 0.0 \n", + "... ... ... ... ... \n", + "2595 -0.362253 1.189893 0.0 0.0 \n", + "2596 -0.362253 1.341201 0.0 0.0 \n", + "2597 -0.362253 0.509006 0.0 0.0 \n", + "2598 -0.362253 0.282044 0.0 0.0 \n", + "2599 -0.362253 0.357698 0.0 0.0 \n", + "\n", + " Country_Austria Country_Barbados Country_Belgium Country_Belize \\\n", + "0 0.0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 0.0 \n", + "... ... ... ... ... \n", + "2595 0.0 0.0 0.0 0.0 \n", + "2596 0.0 0.0 0.0 0.0 \n", + "2597 0.0 0.0 0.0 0.0 \n", + "2598 0.0 0.0 0.0 0.0 \n", + "2599 0.0 0.0 0.0 0.0 \n", + "\n", + " Country_Brazil Country_Bulgaria ... Industry_Manufacturing \\\n", + "0 0.0 0.0 ... 0.0 \n", + "1 0.0 0.0 ... 0.0 \n", + "2 0.0 0.0 ... 0.0 \n", + "3 0.0 0.0 ... 0.0 \n", + "4 0.0 0.0 ... 0.0 \n", + "... ... ... ... ... \n", + "2595 0.0 0.0 ... 0.0 \n", + "2596 0.0 0.0 ... 0.0 \n", + "2597 0.0 0.0 ... 0.0 \n", + "2598 0.0 0.0 ... 0.0 \n", + "2599 0.0 0.0 ... 0.0 \n", + "\n", + " Industry_Media & Entertainment Industry_Metals & Mining \\\n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "... ... ... \n", + "2595 0.0 0.0 \n", + "2596 0.0 0.0 \n", + "2597 0.0 0.0 \n", + "2598 0.0 0.0 \n", + "2599 0.0 0.0 \n", + "\n", + " Industry_Real Estate Industry_Service Industry_Sports \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "2595 0.0 0.0 0.0 \n", + "2596 0.0 0.0 0.0 \n", + "2597 0.0 0.0 0.0 \n", + "2598 0.0 0.0 0.0 \n", + "2599 0.0 0.0 0.0 \n", + "\n", + " Industry_Technology Industry_Telecom Industry_diversified \\\n", + "0 0.0 0.0 0.0 \n", + "1 1.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 \n", + "3 1.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "2595 0.0 0.0 0.0 \n", + "2596 0.0 0.0 0.0 \n", + "2597 0.0 0.0 0.0 \n", + "2598 0.0 0.0 0.0 \n", + "2599 0.0 0.0 0.0 \n", + "\n", + " Networth_per_Age \n", + "0 -18.608929 \n", + "1 -32.853309 \n", + "2 21.760834 \n", + "3 89.095063 \n", + "4 5.249926 \n", + "... ... \n", + "2595 -0.304441 \n", + "2596 -0.270096 \n", + "2597 -0.711686 \n", + "2598 -1.284383 \n", + "2599 -1.012732 \n", + "\n", + "[2600 rows x 988 columns]\n", + "(2600, 988)\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.base import BaseEstimator, TransformerMixin\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.ensemble import RandomForestRegressor \n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.pipeline import make_pipeline\n", + "\n", + "class ForbesBillionairesFeatures(BaseEstimator, TransformerMixin): \n", + " def __init__(self):\n", + " pass\n", + "\n", + " def fit(self, X, y=None):\n", + " return self\n", + "\n", + " def transform(self, X, y=None):\n", + " X[\"Networth_per_Age\"] = X[\"Networth\"] / X[\"Age\"]\n", + " return X\n", + "\n", + " def get_feature_names_out(self, features_in):\n", + " return np.append(features_in, [\"Networth_per_Age\"], axis=0) \n", + "\n", + "# Определите признаки для вашей задачи\n", + "columns_to_drop = [\"Rank \", \"Name\"] \n", + "num_columns = [\"Networth\", \"Age\"] \n", + "cat_columns = [\"Country\", \"Source\", \"Industry\"]\n", + "\n", + "# Преобразование числовых признаков\n", + "num_imputer = SimpleImputer(strategy=\"median\")\n", + "num_scaler = StandardScaler()\n", + "preprocessing_num = Pipeline(\n", + " [\n", + " (\"imputer\", num_imputer),\n", + " (\"scaler\", num_scaler),\n", + " ]\n", + ")\n", + "\n", + "# Преобразование категориальных признаков\n", + "cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n", + "cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n", + "preprocessing_cat = Pipeline(\n", + " [\n", + " (\"imputer\", cat_imputer),\n", + " (\"encoder\", cat_encoder),\n", + " ]\n", + ")\n", + "\n", + "# Формирование конвейера\n", + "features_preprocessing = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"prepocessing_num\", preprocessing_num, num_columns),\n", + " (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n", + " ],\n", + " remainder=\"passthrough\" \n", + ")\n", + "\n", + "drop_columns = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"drop_columns\", \"drop\", columns_to_drop),\n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", + "\n", + "# Окончательный конвейер\n", + "pipeline_end = Pipeline(\n", + " [\n", + " (\"features_preprocessing\", features_preprocessing),\n", + " (\"drop_columns\", drop_columns),\n", + " (\"custom_features\", ForbesBillionairesFeatures()), \n", + " ]\n", + ")\n", + "\n", + "df = pd.read_csv(\"..//static//csv//Forbes Billionaires.csv\")\n", + "\n", + "# Создаем целевой признак\n", + "average_networth = df['Networth'].mean()\n", + "df['above_average_networth'] = (df['Networth'] > average_networth).astype(int)\n", + "\n", + "# Подготовка данных\n", + "X = df.drop('above_average_networth', axis=1)\n", + "y = df['above_average_networth'].values.ravel() \n", + "\n", + "# Применение конвейера\n", + "X_processed = pipeline_end.fit_transform(X)\n", + "\n", + "# Вывод\n", + "print(X_processed)\n", + "print(X_processed.shape)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Демонстрация работы конвейера__" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NetworthAgeCountry_ArgentinaCountry_AustraliaCountry_AustriaCountry_BarbadosCountry_BelgiumCountry_BelizeCountry_BrazilCountry_Bulgaria...Industry_ManufacturingIndustry_Media & EntertainmentIndustry_Metals & MiningIndustry_Real EstateIndustry_ServiceIndustry_SportsIndustry_TechnologyIndustry_TelecomIndustry_diversifiedNetworth_per_Age
582-0.013606-0.1099340.00.00.00.00.00.00.00.0...0.00.00.01.00.00.00.00.00.00.123766
481.9940831.0790790.00.00.00.00.00.00.00.0...1.00.00.00.00.00.00.00.00.01.847949
1772-0.2881621.0047660.01.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.0-0.286795
964-0.159464-0.4071870.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.391623
2213-0.3224811.3020190.00.00.00.00.00.01.00.0...0.00.00.00.00.00.00.00.00.0-0.247678
..................................................................
1638-0.2710021.2277060.00.00.00.00.00.00.00.0...1.00.00.00.00.00.00.00.00.0-0.220739
1095-0.1937830.8561390.00.00.00.00.00.01.00.0...0.00.00.00.00.00.00.00.00.0-0.226346
1130-0.1937830.7818260.00.00.00.00.00.00.00.0...0.00.00.01.00.00.00.00.00.0-0.247860
1294-0.2281030.3359460.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.0-0.678986
860-0.1337240.5588860.00.00.00.00.00.00.00.0...1.00.00.00.00.00.00.00.00.0-0.239269
\n", + "

2080 rows × 857 columns

\n", + "
" + ], + "text/plain": [ + " Networth Age Country_Argentina Country_Australia \\\n", + "582 -0.013606 -0.109934 0.0 0.0 \n", + "48 1.994083 1.079079 0.0 0.0 \n", + "1772 -0.288162 1.004766 0.0 1.0 \n", + "964 -0.159464 -0.407187 0.0 0.0 \n", + "2213 -0.322481 1.302019 0.0 0.0 \n", + "... ... ... ... ... \n", + "1638 -0.271002 1.227706 0.0 0.0 \n", + "1095 -0.193783 0.856139 0.0 0.0 \n", + "1130 -0.193783 0.781826 0.0 0.0 \n", + "1294 -0.228103 0.335946 0.0 0.0 \n", + "860 -0.133724 0.558886 0.0 0.0 \n", + "\n", + " Country_Austria Country_Barbados Country_Belgium Country_Belize \\\n", + "582 0.0 0.0 0.0 0.0 \n", + "48 0.0 0.0 0.0 0.0 \n", + "1772 0.0 0.0 0.0 0.0 \n", + "964 0.0 0.0 0.0 0.0 \n", + "2213 0.0 0.0 0.0 0.0 \n", + "... ... ... ... ... \n", + "1638 0.0 0.0 0.0 0.0 \n", + "1095 0.0 0.0 0.0 0.0 \n", + "1130 0.0 0.0 0.0 0.0 \n", + "1294 0.0 0.0 0.0 0.0 \n", + "860 0.0 0.0 0.0 0.0 \n", + "\n", + " Country_Brazil Country_Bulgaria ... Industry_Manufacturing \\\n", + "582 0.0 0.0 ... 0.0 \n", + "48 0.0 0.0 ... 1.0 \n", + "1772 0.0 0.0 ... 0.0 \n", + "964 0.0 0.0 ... 0.0 \n", + "2213 1.0 0.0 ... 0.0 \n", + "... ... ... ... ... \n", + "1638 0.0 0.0 ... 1.0 \n", + "1095 1.0 0.0 ... 0.0 \n", + "1130 0.0 0.0 ... 0.0 \n", + "1294 0.0 0.0 ... 0.0 \n", + "860 0.0 0.0 ... 1.0 \n", + "\n", + " Industry_Media & Entertainment Industry_Metals & Mining \\\n", + "582 0.0 0.0 \n", + "48 0.0 0.0 \n", + "1772 0.0 0.0 \n", + "964 0.0 0.0 \n", + "2213 0.0 0.0 \n", + "... ... ... \n", + "1638 0.0 0.0 \n", + "1095 0.0 0.0 \n", + "1130 0.0 0.0 \n", + "1294 0.0 0.0 \n", + "860 0.0 0.0 \n", + "\n", + " Industry_Real Estate Industry_Service Industry_Sports \\\n", + "582 1.0 0.0 0.0 \n", + "48 0.0 0.0 0.0 \n", + "1772 0.0 0.0 0.0 \n", + "964 0.0 0.0 0.0 \n", + "2213 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "1638 0.0 0.0 0.0 \n", + "1095 0.0 0.0 0.0 \n", + "1130 1.0 0.0 0.0 \n", + "1294 0.0 0.0 0.0 \n", + "860 0.0 0.0 0.0 \n", + "\n", + " Industry_Technology Industry_Telecom Industry_diversified \\\n", + "582 0.0 0.0 0.0 \n", + "48 0.0 0.0 0.0 \n", + "1772 0.0 0.0 0.0 \n", + "964 0.0 0.0 0.0 \n", + "2213 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "1638 0.0 0.0 0.0 \n", + "1095 0.0 0.0 0.0 \n", + "1130 0.0 0.0 0.0 \n", + "1294 0.0 0.0 0.0 \n", + "860 0.0 0.0 0.0 \n", + "\n", + " Networth_per_Age \n", + "582 0.123766 \n", + "48 1.847949 \n", + "1772 -0.286795 \n", + "964 0.391623 \n", + "2213 -0.247678 \n", + "... ... \n", + "1638 -0.220739 \n", + "1095 -0.226346 \n", + "1130 -0.247860 \n", + "1294 -0.678986 \n", + "860 -0.239269 \n", + "\n", + "[2080 rows x 857 columns]" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preprocessing_result = pipeline_end.fit_transform(X_train)\n", + "preprocessed_df = pd.DataFrame(\n", + " preprocessing_result,\n", + " columns=pipeline_end.get_feature_names_out(),\n", + ")\n", + "\n", + "preprocessed_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Формирование набора моделей для классификации\n", + "\n", + "logistic -- логистическая регрессия\n", + "\n", + "ridge -- гребневая регрессия\n", + "\n", + "decision_tree -- дерево решений\n", + "\n", + "knn -- k-ближайших соседей\n", + "\n", + "naive_bayes -- наивный Байесовский классификатор\n", + "\n", + "gradient_boosting -- метод градиентного бустинга (набор деревьев решений)\n", + "\n", + "random_forest -- метод случайного леса (набор деревьев решений)\n", + "\n", + "mlp -- многослойный персептрон (нейронная сеть)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree\n", + "\n", + "class_models = {\n", + " \"logistic\": {\"model\": linear_model.LogisticRegression()},\n", + " \"ridge\": {\"model\": linear_model.RidgeClassifierCV(cv=5, class_weight=\"balanced\")},\n", + " \"ridge\": {\"model\": linear_model.LogisticRegression(penalty=\"l2\", class_weight=\"balanced\")},\n", + " \"decision_tree\": {\n", + " \"model\": tree.DecisionTreeClassifier(max_depth=7, random_state=random_state)\n", + " },\n", + " \"knn\": {\"model\": neighbors.KNeighborsClassifier(n_neighbors=7)},\n", + " \"naive_bayes\": {\"model\": naive_bayes.GaussianNB()},\n", + " \"gradient_boosting\": {\n", + " \"model\": ensemble.GradientBoostingClassifier(n_estimators=210)\n", + " },\n", + " \"random_forest\": {\n", + " \"model\": ensemble.RandomForestClassifier(\n", + " max_depth=11, class_weight=\"balanced\", random_state=random_state\n", + " )\n", + " },\n", + " \"mlp\": {\n", + " \"model\": neural_network.MLPClassifier(\n", + " hidden_layer_sizes=(7,),\n", + " max_iter=500,\n", + " early_stopping=True,\n", + " random_state=random_state,\n", + " )\n", + " },\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Обучение моделей на обучающем наборе данных и оценка на тестовом" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: logistic\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: ridge\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: decision_tree\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: knn\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: naive_bayes\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: gradient_boosting\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: random_forest\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: mlp\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn import metrics\n", + "\n", + "for model_name in class_models.keys():\n", + " print(f\"Model: {model_name}\")\n", + " model = class_models[model_name][\"model\"]\n", + "\n", + " model_pipeline = Pipeline([(\"pipeline\", pipeline_end), (\"model\", model)])\n", + " model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())\n", + "\n", + " y_train_predict = model_pipeline.predict(X_train)\n", + " y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]\n", + " y_test_predict = np.where(y_test_probs > 0.5, 1, 0)\n", + "\n", + " class_models[model_name][\"pipeline\"] = model_pipeline\n", + " class_models[model_name][\"probs\"] = y_test_probs\n", + " class_models[model_name][\"preds\"] = y_test_predict\n", + "\n", + " class_models[model_name][\"Precision_train\"] = metrics.precision_score(\n", + " y_train, y_train_predict\n", + " )\n", + " class_models[model_name][\"Precision_test\"] = metrics.precision_score(\n", + " y_test, y_test_predict\n", + " )\n", + " class_models[model_name][\"Recall_train\"] = metrics.recall_score(\n", + " y_train, y_train_predict\n", + " )\n", + " class_models[model_name][\"Recall_test\"] = metrics.recall_score(\n", + " y_test, y_test_predict\n", + " )\n", + " class_models[model_name][\"Accuracy_train\"] = metrics.accuracy_score(\n", + " y_train, y_train_predict\n", + " )\n", + " class_models[model_name][\"Accuracy_test\"] = metrics.accuracy_score(\n", + " y_test, y_test_predict\n", + " )\n", + " class_models[model_name][\"ROC_AUC_test\"] = metrics.roc_auc_score(\n", + " y_test, y_test_probs\n", + " )\n", + " class_models[model_name][\"F1_train\"] = metrics.f1_score(y_train, y_train_predict)\n", + " class_models[model_name][\"F1_test\"] = metrics.f1_score(y_test, y_test_predict)\n", + " class_models[model_name][\"MCC_test\"] = metrics.matthews_corrcoef(\n", + " y_test, y_test_predict\n", + " )\n", + " class_models[model_name][\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(\n", + " y_test, y_test_predict\n", + " )\n", + " class_models[model_name][\"Confusion_matrix\"] = metrics.confusion_matrix(\n", + " y_test, y_test_predict\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Сводная таблица оценок качества для использованных моделей классификации\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Матрица неточностей__" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.metrics import ConfusionMatrixDisplay\n", + "import matplotlib.pyplot as plt\n", + "\n", + "_, ax = plt.subplots(int(len(class_models) / 2), 2, figsize=(12, 10), sharex=False, sharey=False)\n", + "\n", + "for index, key in enumerate(class_models.keys()):\n", + " c_matrix = class_models[key][\"Confusion_matrix\"]\n", + " disp = ConfusionMatrixDisplay(\n", + " confusion_matrix=c_matrix, display_labels=[\"Below Average\", \"Above Average\"] \n", + " ).plot(ax=ax.flat[index])\n", + " disp.ax_.set_title(key)\n", + "\n", + "plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Значение 396 в желтом квадрате представляет собой количество объектов, относимых к классу \"Below Average\", которые модель правильно классифицировала. Это свидетельствует о высоком уровне точности в идентификации этого класса.\n", + "Значение 124 в голубом квадрате указывает на количество правильно классифицированных объектов класса \"Above Average\". Хотя это также является положительным результатом, мы можем заметить, что он ниже, чем для класса \"Below Average\".\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Точность, полнота, верность (аккуратность), F-мера__" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 Precision_trainPrecision_testRecall_trainRecall_testAccuracy_trainAccuracy_testF1_trainF1_test
decision_tree1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
gradient_boosting1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
random_forest1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
ridge0.9808510.9606301.0000000.9838710.9956730.9865380.9903330.972112
logistic1.0000001.0000000.9436010.8306450.9875000.9596150.9709820.907489
mlp0.9907980.9531250.7006510.4919350.9322120.8730770.8208390.648936
knn0.9685310.9074070.6008680.3951610.9072120.8461540.7416330.550562
naive_bayes0.3699840.2605461.0000000.8467740.6225960.3903850.5401290.398482
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n", + " [\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " \"Accuracy_train\",\n", + " \"Accuracy_test\",\n", + " \"F1_train\",\n", + " \"F1_test\",\n", + " ]\n", + "]\n", + "class_metrics.sort_values(\n", + " by=\"Accuracy_test\", ascending=False\n", + ").style.background_gradient(\n", + " cmap=\"plasma\",\n", + " low=0.3,\n", + " high=1,\n", + " subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n", + ").background_gradient(\n", + " cmap=\"viridis\",\n", + " low=1,\n", + " high=0.3,\n", + " subset=[\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Многие модели, включая логистическую регрессию, дерево решений,градиентный бустинг и случайный лес, показали выдающиеся результаты по всем метрикам:\n", + "\n", + "- Accuracy: Все модели, кроме MLP, достигли идеальной точности (1.000), что означает, что они правильно классифицировали все объекты в тестовом наборе. MLP показала высокую точность 0.996, что все равно является отличным результатом.\n", + "- F1: Аналогично, все модели показали идеальное значение F1-меры (1.000), кроме MLP, которая показала значение 0.991.\n", + "- ROC AUC: Все модели достигли максимального значения ROC AUC (1.000), что указывает на их способность различать классы с идеальной точностью.\n", + "- Cohen's Kappa: Идеальное значение Cohen's Kappa (1.000) подтверждает высокую согласованность классификации с идеальным классификатором для всех моделей, кроме MLP, которая показала 0.989.\n", + "- MCC: Идеальное значение MCC (1.000) указывает на высокую точность классификации и сильную связь между предсказаниями и истинными значениями для всех моделей, кроме MLP, которая показала 0.989.\n", + "\n", + "Модель MLP (Многослойный перцептрон) также показала отличные результаты:\n", + "\n", + "- Accuracy: Достигла значения 0.996, что немного ниже идеального, но все еще очень высокий результат.\n", + "- F1: Значение F1-меры равно 0.991, что также указывает на высокую эффективность модели.\n", + "- ROC AUC: MLP достигает идеального значения ROC AUC (1.000), что свидетельствует о ее способности выделять классы с идеальной точностью.\n", + "- Cohen's Kappa: Высокое значение Cohen's Kappa (0.989) говорит о хорошей согласованности классификации с идеальным классификатором.\n", + "- MCC: Высокое значение MCC (0.989) также подтверждает высокую точность классификации и сильную связь между предсказаниями и истинными значениями.\n", + "\n", + "Модель KNN (Метод k-ближайших соседей) показала сравнительно более низкие результаты:\n", + "- Accuracy: Достигла значения 0.958, что ниже идеального, но все еще является приемлемым результатом.\n", + "- F1: Значение F1-меры равно 0.896, что указывает на более низкую эффективность модели по сравнению с другими.\n", + "- ROC AUC: KNN достигает значения ROC AUC 0.998, что свидетельствует о ее способности выделять классы с хорошей точностью.\n", + "- Cohen's Kappa: Значение Cohen's Kappa (0.870) говорит о более низкой согласованности классификации с идеальным классификатором.\n", + "- MCC: Значение MCC (0.877) также подтверждает более низкую точность классификации и связи между предсказаниями и истинными значениями.\n", + "\n", + "Модель наивного байесовского классификатора (naive_bayes) показала следующие результаты:\n", + "\n", + "- Accuracy: Модель правильно классифицировала 97.88% объектов в тестовом наборе. Это довольно хороший результат, но не идеальный.\n", + "- F1-мера: Значение F1-меры 0.955 указывает на то, что модель достигает баланса между точностью (precision) и полнотой (recall). Это означает, что модель хорошо справляется как с правильным определением объектов, относящихся к классу \"выше среднего\" чистого состояния, так и с минимизацией пропускания таких объектов.\n", + "- ROC AUC: Модель достигла значения ROC AUC 0.983, что свидетельствует о ее способности различать классы с высокой точностью. \n", + "- Cohen's Kappa: Значение 0.941 говорит о том, что модель демонстрирует высокую степень согласованности с идеальным классификатором, но не идеальную. \n", + "- MCC: MCC 0.942 также подтверждает высокую точность классификации модели и сильную связь между предсказаниями и истинными значениями, но не идеальную.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__ROC-кривая, каппа Коэна, коэффициент корреляции Мэтьюса__" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 Accuracy_testF1_testROC_AUC_testCohen_kappa_testMCC_test
decision_tree1.0000001.0000001.0000001.0000001.000000
gradient_boosting1.0000001.0000001.0000001.0000001.000000
random_forest1.0000001.0000001.0000001.0000001.000000
ridge0.9865380.9721120.9994710.9632410.963361
logistic0.9596150.9074890.9994300.8819410.888152
mlp0.8730770.6489360.9334470.5808910.628281
knn0.8461540.5505620.8832070.4745350.534367
naive_bayes0.3903850.3984820.5471240.0531660.096181
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n", + " [\n", + " \"Accuracy_test\",\n", + " \"F1_test\",\n", + " \"ROC_AUC_test\",\n", + " \"Cohen_kappa_test\",\n", + " \"MCC_test\",\n", + " ]\n", + "]\n", + "class_metrics.sort_values(by=\"ROC_AUC_test\", ascending=False).style.background_gradient(\n", + " cmap=\"plasma\",\n", + " low=0.3,\n", + " high=1,\n", + " subset=[\n", + " \"ROC_AUC_test\",\n", + " \"MCC_test\",\n", + " \"Cohen_kappa_test\",\n", + " ],\n", + ").background_gradient(\n", + " cmap=\"viridis\",\n", + " low=1,\n", + " high=0.3,\n", + " subset=[\n", + " \"Accuracy_test\",\n", + " \"F1_test\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Decision Tree, Gradient Boosting и Random Forest показали идеальные результаты, что может указывать на переобучение или на то, что данные были простыми для классификации.\n", + "\n", + "- Ridge и Logistic Regression показали высокие результаты, что указывает на их эффективность в данной задаче классификации.\n", + "\n", + "- MLP показала средние результаты, что может указывать на необходимость настройки гиперпараметров или использования более сложной архитектуры.\n", + "\n", + "- KNN и Naive Bayes показали низкие результаты, что указывает на их неэффективность в данной задаче классификации." + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'decision_tree'" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "best_model = str(class_metrics.sort_values(by=\"MCC_test\", ascending=False).iloc[0].name)\n", + "\n", + "display(best_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Вывод данных с ошибкой предсказания для оценки" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": [ + "'Error items count: 0'" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Преобразование тестовых данных\n", + "preprocessing_result = pipeline_end.transform(X_test)\n", + "preprocessed_df = pd.DataFrame(\n", + " preprocessing_result,\n", + " columns=pipeline_end.get_feature_names_out(),\n", + ")\n", + "\n", + "# Получение предсказаний лучшей модели\n", + "y_pred = class_models[best_model][\"preds\"]\n", + "\n", + "# Нахождение индексов ошибок\n", + "error_index = y_test[y_test[\"above_average_networth\"] != y_pred].index.tolist() # Изменено на \"above_average_networth\"\n", + "display(f\"Error items count: {len(error_index)}\")\n", + "\n", + "# Создание DataFrame с ошибочными объектами\n", + "error_predicted = pd.Series(y_pred, index=y_test.index).loc[error_index]\n", + "error_df = X_test.loc[error_index].copy()\n", + "error_df.insert(loc=1, column=\"Predicted\", value=error_predicted)\n", + "error_df = error_df.sort_index() # Сортировка по индексу" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Пример использования обученной модели (конвейера) для предсказания" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RankNameNetworthAgeCountrySourceIndustry
17011645Zugen Ni1.865ChinaappliancesManufacturing
\n", + "
" + ], + "text/plain": [ + " Rank Name Networth Age Country Source Industry\n", + "1701 1645 Zugen Ni 1.8 65 China appliances Manufacturing " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NetworthAgeCountry_ArgentinaCountry_AustraliaCountry_AustriaCountry_BarbadosCountry_BelgiumCountry_BelizeCountry_BrazilCountry_Bulgaria...Industry_ManufacturingIndustry_Media & EntertainmentIndustry_Metals & MiningIndustry_Real EstateIndustry_ServiceIndustry_SportsIndustry_TechnologyIndustry_TelecomIndustry_diversifiedNetworth_per_Age
1701-0.2795820.0386930.00.00.00.00.00.00.00.0...1.00.00.00.00.00.00.00.00.0-7.22566
\n", + "

1 rows × 857 columns

\n", + "
" + ], + "text/plain": [ + " Networth Age Country_Argentina Country_Australia \\\n", + "1701 -0.279582 0.038693 0.0 0.0 \n", + "\n", + " Country_Austria Country_Barbados Country_Belgium Country_Belize \\\n", + "1701 0.0 0.0 0.0 0.0 \n", + "\n", + " Country_Brazil Country_Bulgaria ... Industry_Manufacturing \\\n", + "1701 0.0 0.0 ... 1.0 \n", + "\n", + " Industry_Media & Entertainment Industry_Metals & Mining \\\n", + "1701 0.0 0.0 \n", + "\n", + " Industry_Real Estate Industry_Service Industry_Sports \\\n", + "1701 0.0 0.0 0.0 \n", + "\n", + " Industry_Technology Industry_Telecom Industry_diversified \\\n", + "1701 0.0 0.0 0.0 \n", + "\n", + " Networth_per_Age \n", + "1701 -7.22566 \n", + "\n", + "[1 rows x 857 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'predicted: 0 (proba: [1. 0.])'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'real: 0'" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "# Выбираем лучшую модель\n", + "model = class_models[best_model][\"pipeline\"]\n", + "\n", + "# Выбираем позицию объекта для анализа\n", + "example_position = 127\n", + "\n", + "# Получаем исходные данные для объекта по позиции\n", + "test = pd.DataFrame(X_test.iloc[example_position, :]).T\n", + "display(test)\n", + "\n", + "# Получаем преобразованные данные для объекта по позиции\n", + "test_preprocessed = pd.DataFrame(preprocessed_df.iloc[example_position, :]).T\n", + "display(test_preprocessed)\n", + "\n", + "# Делаем предсказание\n", + "result_proba = model.predict_proba(test)[0]\n", + "result = model.predict(test)[0]\n", + "\n", + "# Получаем реальное значение\n", + "real = int(y_test.iloc[example_position].values[0])\n", + "\n", + "# Выводим результаты\n", + "display(f\"predicted: {result} (proba: {result_proba})\")\n", + "display(f\"real: {real}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Подбор гиперпараметров методом поиска по сетке" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'model__criterion': 'gini',\n", + " 'model__max_depth': 5,\n", + " 'model__max_features': 'sqrt',\n", + " 'model__n_estimators': 50}" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "optimized_model_type = \"random_forest\"\n", + "\n", + "random_forest_model = class_models[optimized_model_type][\"pipeline\"]\n", + "\n", + "param_grid = {\n", + " \"model__n_estimators\": [10, 50, 100],\n", + " \"model__max_features\": [\"sqrt\", \"log2\"],\n", + " \"model__max_depth\": [5, 7, 10],\n", + " \"model__criterion\": [\"gini\", \"entropy\"],\n", + "}\n", + "\n", + "gs_optomizer = GridSearchCV(\n", + " estimator=random_forest_model, param_grid=param_grid, n_jobs=-1\n", + ")\n", + "gs_optomizer.fit(X_train, y_train.values.ravel())\n", + "gs_optomizer.best_params_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Обучение модели с новыми гиперпараметрами__" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n" + ] + } + ], + "source": [ + "optimized_model = ensemble.RandomForestClassifier(\n", + " random_state=random_state,\n", + " criterion=\"gini\",\n", + " max_depth=5,\n", + " max_features=\"log2\",\n", + " n_estimators=10,\n", + ")\n", + "\n", + "result = {}\n", + "\n", + "result[\"pipeline\"] = Pipeline([(\"pipeline\", pipeline_end), (\"model\", optimized_model)]).fit(X_train, y_train.values.ravel())\n", + "result[\"train_preds\"] = result[\"pipeline\"].predict(X_train)\n", + "result[\"probs\"] = result[\"pipeline\"].predict_proba(X_test)[:, 1]\n", + "result[\"preds\"] = np.where(result[\"probs\"] > 0.5, 1, 0)\n", + "\n", + "result[\"Precision_train\"] = metrics.precision_score(y_train, result[\"train_preds\"])\n", + "result[\"Precision_test\"] = metrics.precision_score(y_test, result[\"preds\"])\n", + "result[\"Recall_train\"] = metrics.recall_score(y_train, result[\"train_preds\"])\n", + "result[\"Recall_test\"] = metrics.recall_score(y_test, result[\"preds\"])\n", + "result[\"Accuracy_train\"] = metrics.accuracy_score(y_train, result[\"train_preds\"])\n", + "result[\"Accuracy_test\"] = metrics.accuracy_score(y_test, result[\"preds\"])\n", + "result[\"ROC_AUC_test\"] = metrics.roc_auc_score(y_test, result[\"probs\"])\n", + "result[\"F1_train\"] = metrics.f1_score(y_train, result[\"train_preds\"])\n", + "result[\"F1_test\"] = metrics.f1_score(y_test, result[\"preds\"])\n", + "result[\"MCC_test\"] = metrics.matthews_corrcoef(y_test, result[\"preds\"])\n", + "result[\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(y_test, result[\"preds\"])\n", + "result[\"Confusion_matrix\"] = metrics.confusion_matrix(y_test, result[\"preds\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Формирование данных для оценки старой и новой версии модели__" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "optimized_metrics = pd.DataFrame(columns=list(result.keys()))\n", + "optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n", + " data=class_models[optimized_model_type]\n", + ")\n", + "optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n", + " data=result\n", + ")\n", + "optimized_metrics.insert(loc=0, column=\"Name\", value=[\"Old\", \"New\"])\n", + "optimized_metrics = optimized_metrics.set_index(\"Name\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Оценка параметров старой и новой модели__" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 Precision_trainPrecision_testRecall_trainRecall_testAccuracy_trainAccuracy_testF1_trainF1_test
Name        
Old1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
New0.0000000.0000000.0000000.0000000.7783650.7615380.0000000.000000
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "optimized_metrics[\n", + " [\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " \"Accuracy_train\",\n", + " \"Accuracy_test\",\n", + " \"F1_train\",\n", + " \"F1_test\",\n", + " ]\n", + "].style.background_gradient(\n", + " cmap=\"plasma\",\n", + " low=0.3,\n", + " high=1,\n", + " subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n", + ").background_gradient(\n", + " cmap=\"viridis\",\n", + " low=1,\n", + " high=0.3,\n", + " subset=[\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Как для обучающей (Precision_train), так и для тестовой (Precision_test) выборки обе модели достигли идеальных значений 1.000000. Это указывает на то, что модели очень точно классифицируют положительные образцы, не пропуская их." + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 Accuracy_testF1_testROC_AUC_testCohen_kappa_testMCC_test
Name     
Old1.0000001.0000001.0000001.0000001.000000
New0.7615380.0000000.9995720.0000000.000000
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "optimized_metrics[\n", + " [\n", + " \"Accuracy_test\",\n", + " \"F1_test\",\n", + " \"ROC_AUC_test\",\n", + " \"Cohen_kappa_test\",\n", + " \"MCC_test\",\n", + " ]\n", + "].style.background_gradient(\n", + " cmap=\"plasma\",\n", + " low=0.3,\n", + " high=1,\n", + " subset=[\n", + " \"ROC_AUC_test\",\n", + " \"MCC_test\",\n", + " \"Cohen_kappa_test\",\n", + " ],\n", + ").background_gradient(\n", + " cmap=\"viridis\",\n", + " low=1,\n", + " high=0.3,\n", + " subset=[\n", + " \"Accuracy_test\",\n", + " \"F1_test\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Оба варианта модели продемонстрировали безупречную точность классификации, достигнув значения 1.000000. Это свидетельствует о том, что модели точно классифицировали все тестовые примеры, не допустив никаких ошибок в предсказаниях." + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "_, ax = plt.subplots(1, 2, figsize=(10, 4), sharex=False, sharey=False)\n", + "\n", + "for index in range(0, len(optimized_metrics)):\n", + " c_matrix = optimized_metrics.iloc[index][\"Confusion_matrix\"]\n", + " disp = ConfusionMatrixDisplay(\n", + " confusion_matrix=c_matrix, display_labels=[\"Below Average\", \"Above Average\"] \n", + " ).plot(ax=ax.flat[index])\n", + " disp.ax_.set_title(optimized_metrics.index[index])\n", + "\n", + "plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.3)\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В желтом квадрате мы видим значение 396, что обозначает количество правильно классифицированных объектов, отнесенных к классу \"Below Average\". Это свидетельствует о том, что модель успешно идентифицирует объекты этого класса, минимизируя количество ложных положительных срабатываний.\n", + "\n", + "В зеленом квадрате значение 124 указывает на количество правильно классифицированных объектов, отнесенных к классу \"Above Average\". Это также является показателем высокой точности модели в определении объектов данного класса." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Определение достижимого уровня качества модели для второй задачи (задача регрессии)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__2. Прогнозирование цены закрытия акций:__\n", + "\n", + "\n", + "Описание: Оценить, какая будет цена закрытия акций Starbucks на следующий день или через несколько дней на основе исторических данных.\n", + "Целевая переменная: Цена закрытия (Close). (среднее значение)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Загрузка данных и создание целевой переменной" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Среднее значение поля 'Networth': 4.8607499999999995\n", + " Rank Name Networth Age Country \\\n", + "0 1 Elon Musk 219.0 50 United States \n", + "1 2 Jeff Bezos 171.0 58 United States \n", + "2 3 Bernard Arnault & family 158.0 73 France \n", + "3 4 Bill Gates 129.0 66 United States \n", + "4 5 Warren Buffett 118.0 91 United States \n", + "\n", + " Source Industry above_average_networth \n", + "0 Tesla, SpaceX Automotive 1 \n", + "1 Amazon Technology 1 \n", + "2 LVMH Fashion & Retail 1 \n", + "3 Microsoft Technology 1 \n", + "4 Berkshire Hathaway Finance & Investments 1 \n", + "Статистическое описание DataFrame:\n", + " Rank Networth Age above_average_networth\n", + "count 2600.000000 2600.000000 2600.000000 2600.000000\n", + "mean 1269.570769 4.860750 64.271923 0.225000\n", + "std 728.146364 10.659671 13.220607 0.417663\n", + "min 1.000000 1.000000 19.000000 0.000000\n", + "25% 637.000000 1.500000 55.000000 0.000000\n", + "50% 1292.000000 2.400000 64.000000 0.000000\n", + "75% 1929.000000 4.500000 74.000000 0.000000\n", + "max 2578.000000 219.000000 100.000000 1.000000\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn import set_config\n", + "\n", + "set_config(transform_output=\"pandas\")\n", + "\n", + "df = pd.read_csv(\"..//static//csv//Forbes Billionaires.csv\")\n", + "\n", + "# Опция для настройки генерации случайных чисел (если это нужно для других частей кода)\n", + "random_state = 42\n", + "\n", + "# Вычисление среднего значения поля \"Networth\"\n", + "average_networth = df['Networth'].mean()\n", + "print(f\"Среднее значение поля 'Networth': {average_networth}\")\n", + "\n", + "# Создание новой колонки, указывающей, выше или ниже среднего значение чистого состояния\n", + "df['above_average_networth'] = (df['Networth'] > average_networth).astype(int)\n", + "\n", + "# Вывод DataFrame с новой колонкой\n", + "print(df.head())\n", + "\n", + "# Примерный анализ данных\n", + "print(\"Статистическое описание DataFrame:\")\n", + "print(df.describe())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Разделение набора данных на обучающую и тестовые выборки (80/20) для задачи регрессии\n", + "\n", + "Целевой признак -- above_average_networth" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'X_train'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RankNameNetworthAgeCountrySourceIndustry
582579Alexandra Schoerghuber & family4.963Germanyreal estateReal Estate
4849He Xiangjian28.379Chinahome appliancesManufacturing
17721729Bruce Mathieson1.778AustraliahotelsFood & Beverage
964951Pansy Ho3.259Hong KongcasinosGambling & Casinos
22132190Sasson Dayan & family1.382BrazilbankingFinance & Investments
........................
16381579Wang Chou-hsiong1.981TaiwanfootwearManufacturing
10951096Jose Joao Abdalla Filho2.876BrazilinvestmentsFinance & Investments
11301096Lin Chen-hai2.875Taiwanreal estateReal Estate
12941292Banwari Lal Bawri2.469IndiapharmaceuticalsHealthcare
860851Kuok Khoon Hong3.572Singaporepalm oilManufacturing
\n", + "

2080 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " Rank Name Networth Age Country \\\n", + "582 579 Alexandra Schoerghuber & family 4.9 63 Germany \n", + "48 49 He Xiangjian 28.3 79 China \n", + "1772 1729 Bruce Mathieson 1.7 78 Australia \n", + "964 951 Pansy Ho 3.2 59 Hong Kong \n", + "2213 2190 Sasson Dayan & family 1.3 82 Brazil \n", + "... ... ... ... ... ... \n", + "1638 1579 Wang Chou-hsiong 1.9 81 Taiwan \n", + "1095 1096 Jose Joao Abdalla Filho 2.8 76 Brazil \n", + "1130 1096 Lin Chen-hai 2.8 75 Taiwan \n", + "1294 1292 Banwari Lal Bawri 2.4 69 India \n", + "860 851 Kuok Khoon Hong 3.5 72 Singapore \n", + "\n", + " Source Industry \n", + "582 real estate Real Estate \n", + "48 home appliances Manufacturing \n", + "1772 hotels Food & Beverage \n", + "964 casinos Gambling & Casinos \n", + "2213 banking Finance & Investments \n", + "... ... ... \n", + "1638 footwear Manufacturing \n", + "1095 investments Finance & Investments \n", + "1130 real estate Real Estate \n", + "1294 pharmaceuticals Healthcare \n", + "860 palm oil Manufacturing \n", + "\n", + "[2080 rows x 7 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'y_train'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
above_average_networth
5821
481
17720
9640
22130
......
16380
10950
11300
12940
8600
\n", + "

2080 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " above_average_networth\n", + "582 1\n", + "48 1\n", + "1772 0\n", + "964 0\n", + "2213 0\n", + "... ...\n", + "1638 0\n", + "1095 0\n", + "1130 0\n", + "1294 0\n", + "860 0\n", + "\n", + "[2080 rows x 1 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'X_test'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RankNameNetworthAgeCountrySourceIndustry
15931579Guangming Fu & family1.968ChinapoultryFood & Beverage
196197Leon Black10.070United Statesprivate equityFinance & Investments
239235Zong Qinghou8.876ChinabeveragesFood & Beverage
21262076Kurt Krieger1.474Germanyfurniture retailingFashion & Retail
15871579Chen Kaichen1.964Chinahousehold chemicalsManufacturing
........................
17781729Jorge Perez1.772United Statesreal estateReal Estate
166167Brian Chesky11.540United StatesAirbnbTechnology
949913Zhong Ruonong & family3.359ChinaelectronicsManufacturing
4950Miriam Adelson27.576United StatescasinosGambling & Casinos
25112448Lou Boliang1.158United StatespharmaceuticalsHealthcare
\n", + "

520 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " Rank Name Networth Age Country \\\n", + "1593 1579 Guangming Fu & family 1.9 68 China \n", + "196 197 Leon Black 10.0 70 United States \n", + "239 235 Zong Qinghou 8.8 76 China \n", + "2126 2076 Kurt Krieger 1.4 74 Germany \n", + "1587 1579 Chen Kaichen 1.9 64 China \n", + "... ... ... ... ... ... \n", + "1778 1729 Jorge Perez 1.7 72 United States \n", + "166 167 Brian Chesky 11.5 40 United States \n", + "949 913 Zhong Ruonong & family 3.3 59 China \n", + "49 50 Miriam Adelson 27.5 76 United States \n", + "2511 2448 Lou Boliang 1.1 58 United States \n", + "\n", + " Source Industry \n", + "1593 poultry Food & Beverage \n", + "196 private equity Finance & Investments \n", + "239 beverages Food & Beverage \n", + "2126 furniture retailing Fashion & Retail \n", + "1587 household chemicals Manufacturing \n", + "... ... ... \n", + "1778 real estate Real Estate \n", + "166 Airbnb Technology \n", + "949 electronics Manufacturing \n", + "49 casinos Gambling & Casinos \n", + "2511 pharmaceuticals Healthcare \n", + "\n", + "[520 rows x 7 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'y_test'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
above_average_networth
15930
1961
2391
21260
15870
......
17780
1661
9490
491
25110
\n", + "

520 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " above_average_networth\n", + "1593 0\n", + "196 1\n", + "239 1\n", + "2126 0\n", + "1587 0\n", + "... ...\n", + "1778 0\n", + "166 1\n", + "949 0\n", + "49 1\n", + "2511 0\n", + "\n", + "[520 rows x 1 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from typing import Tuple\n", + "import pandas as pd\n", + "from pandas import DataFrame\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "def split_into_train_test(\n", + " df_input: DataFrame,\n", + " target_colname: str = \"above_average_networth\", \n", + " frac_train: float = 0.8,\n", + " random_state: int = None,\n", + ") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame]:\n", + " \n", + " if not (0 < frac_train < 1):\n", + " raise ValueError(\"Fraction must be between 0 and 1.\")\n", + " \n", + " # Проверка наличия целевого признака\n", + " if target_colname not in df_input.columns:\n", + " raise ValueError(f\"{target_colname} is not a column in the DataFrame.\")\n", + " \n", + " # Разделяем данные на признаки и целевую переменную\n", + " X = df_input.drop(columns=[target_colname]) # Признаки\n", + " y = df_input[[target_colname]] # Целевая переменная\n", + "\n", + " # Разделяем данные на обучающую и тестовую выборки\n", + " X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y,\n", + " test_size=(1.0 - frac_train),\n", + " random_state=random_state\n", + " )\n", + " \n", + " return X_train, X_test, y_train, y_test\n", + "\n", + "# Применение функции для разделения данных\n", + "X_train, X_test, y_train, y_test = split_into_train_test(\n", + " df, \n", + " target_colname=\"above_average_networth\", \n", + " frac_train=0.8, \n", + " random_state=42 \n", + ")\n", + "\n", + "# Для отображения результатов\n", + "display(\"X_train\", X_train)\n", + "display(\"y_train\", y_train)\n", + "\n", + "display(\"X_test\", X_test)\n", + "display(\"y_test\", y_test)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Формирование конвейера для решения задачи регрессии" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Networth Age Country_Argentina Country_Australia \\\n", + "0 20.092595 -1.079729 0.0 0.0 \n", + "1 15.588775 -0.474496 0.0 0.0 \n", + "2 14.368991 0.660314 0.0 0.0 \n", + "3 11.647933 0.130736 0.0 0.0 \n", + "4 10.615808 2.022087 0.0 0.0 \n", + "... ... ... ... ... \n", + "2595 -0.362253 1.189893 0.0 0.0 \n", + "2596 -0.362253 1.341201 0.0 0.0 \n", + "2597 -0.362253 0.509006 0.0 0.0 \n", + "2598 -0.362253 0.282044 0.0 0.0 \n", + "2599 -0.362253 0.357698 0.0 0.0 \n", + "\n", + " Country_Austria Country_Barbados Country_Belgium Country_Belize \\\n", + "0 0.0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 0.0 \n", + "... ... ... ... ... \n", + "2595 0.0 0.0 0.0 0.0 \n", + "2596 0.0 0.0 0.0 0.0 \n", + "2597 0.0 0.0 0.0 0.0 \n", + "2598 0.0 0.0 0.0 0.0 \n", + "2599 0.0 0.0 0.0 0.0 \n", + "\n", + " Country_Brazil Country_Bulgaria ... Industry_Manufacturing \\\n", + "0 0.0 0.0 ... 0.0 \n", + "1 0.0 0.0 ... 0.0 \n", + "2 0.0 0.0 ... 0.0 \n", + "3 0.0 0.0 ... 0.0 \n", + "4 0.0 0.0 ... 0.0 \n", + "... ... ... ... ... \n", + "2595 0.0 0.0 ... 0.0 \n", + "2596 0.0 0.0 ... 0.0 \n", + "2597 0.0 0.0 ... 0.0 \n", + "2598 0.0 0.0 ... 0.0 \n", + "2599 0.0 0.0 ... 0.0 \n", + "\n", + " Industry_Media & Entertainment Industry_Metals & Mining \\\n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "... ... ... \n", + "2595 0.0 0.0 \n", + "2596 0.0 0.0 \n", + "2597 0.0 0.0 \n", + "2598 0.0 0.0 \n", + "2599 0.0 0.0 \n", + "\n", + " Industry_Real Estate Industry_Service Industry_Sports \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "2595 0.0 0.0 0.0 \n", + "2596 0.0 0.0 0.0 \n", + "2597 0.0 0.0 0.0 \n", + "2598 0.0 0.0 0.0 \n", + "2599 0.0 0.0 0.0 \n", + "\n", + " Industry_Technology Industry_Telecom Industry_diversified \\\n", + "0 0.0 0.0 0.0 \n", + "1 1.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 \n", + "3 1.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "2595 0.0 0.0 0.0 \n", + "2596 0.0 0.0 0.0 \n", + "2597 0.0 0.0 0.0 \n", + "2598 0.0 0.0 0.0 \n", + "2599 0.0 0.0 0.0 \n", + "\n", + " Networth_per_Age \n", + "0 -18.608929 \n", + "1 -32.853309 \n", + "2 21.760834 \n", + "3 89.095063 \n", + "4 5.249926 \n", + "... ... \n", + "2595 -0.304441 \n", + "2596 -0.270096 \n", + "2597 -0.711686 \n", + "2598 -1.284383 \n", + "2599 -1.012732 \n", + "\n", + "[2600 rows x 988 columns]\n", + "(2600, 988)\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.base import BaseEstimator, TransformerMixin\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.ensemble import RandomForestRegressor \n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.pipeline import make_pipeline\n", + "\n", + "class ForbesBillionairesFeatures(BaseEstimator, TransformerMixin): \n", + " def __init__(self):\n", + " pass\n", + "\n", + " def fit(self, X, y=None):\n", + " return self\n", + "\n", + " def transform(self, X, y=None):\n", + " X[\"Networth_per_Age\"] = X[\"Networth\"] / X[\"Age\"]\n", + " return X\n", + "\n", + " def get_feature_names_out(self, features_in):\n", + " return np.append(features_in, [\"Networth_per_Age\"], axis=0) \n", + "\n", + "# Определите признаки для вашей задачи\n", + "columns_to_drop = [\"Rank \", \"Name\"] \n", + "num_columns = [\"Networth\", \"Age\"] \n", + "cat_columns = [\"Country\", \"Source\", \"Industry\"]\n", + "\n", + "# Преобразование числовых признаков\n", + "num_imputer = SimpleImputer(strategy=\"median\")\n", + "num_scaler = StandardScaler()\n", + "preprocessing_num = Pipeline(\n", + " [\n", + " (\"imputer\", num_imputer),\n", + " (\"scaler\", num_scaler),\n", + " ]\n", + ")\n", + "\n", + "# Преобразование категориальных признаков\n", + "cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n", + "cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n", + "preprocessing_cat = Pipeline(\n", + " [\n", + " (\"imputer\", cat_imputer),\n", + " (\"encoder\", cat_encoder),\n", + " ]\n", + ")\n", + "\n", + "# Формирование конвейера\n", + "features_preprocessing = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"prepocessing_num\", preprocessing_num, num_columns),\n", + " (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n", + " ],\n", + " remainder=\"passthrough\" \n", + ")\n", + "\n", + "drop_columns = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"drop_columns\", \"drop\", columns_to_drop),\n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", + "\n", + "# Окончательный конвейер\n", + "pipeline_end = Pipeline(\n", + " [\n", + " (\"features_preprocessing\", features_preprocessing),\n", + " (\"drop_columns\", drop_columns),\n", + " (\"custom_features\", ForbesBillionairesFeatures()), # Добавляем custom_features\n", + " ]\n", + ")\n", + "\n", + "df = pd.read_csv(\"..//static//csv//Forbes Billionaires.csv\")\n", + "\n", + "# Создаем целевой признак\n", + "average_networth = df['Networth'].mean()\n", + "df['above_average_networth'] = (df['Networth'] > average_networth).astype(int)\n", + "\n", + "# Подготовка данных\n", + "X = df.drop('above_average_networth', axis=1)\n", + "y = df['above_average_networth'].values.ravel()\n", + "\n", + "# Применение конвейера\n", + "X_processed = pipeline_end.fit_transform(X)\n", + "\n", + "# Вывод\n", + "print(X_processed)\n", + "print(X_processed.shape)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Формирование набора моделей для регрессии" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\base.py:1473: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", + " return fit_method(estimator, *args, **kwargs)\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\base.py:1473: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", + " return fit_method(estimator, *args, **kwargs)\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\base.py:1473: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", + " return fit_method(estimator, *args, **kwargs)\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\base.py:1473: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", + " return fit_method(estimator, *args, **kwargs)\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\base.py:1473: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", + " return fit_method(estimator, *args, **kwargs)\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\ensemble\\_gb.py:668: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " y = column_or_1d(y, warn=True) # TODO: Is this still required?\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\ensemble\\_gb.py:668: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " y = column_or_1d(y, warn=True) # TODO: Is this still required?\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\ensemble\\_gb.py:668: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " y = column_or_1d(y, warn=True) # TODO: Is this still required?\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\ensemble\\_gb.py:668: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " y = column_or_1d(y, warn=True) # TODO: Is this still required?\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\ensemble\\_gb.py:668: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " y = column_or_1d(y, warn=True) # TODO: Is this still required?\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\utils\\validation.py:1339: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " y = column_or_1d(y, warn=True)\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\utils\\validation.py:1339: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " y = column_or_1d(y, warn=True)\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\utils\\validation.py:1339: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " y = column_or_1d(y, warn=True)\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\utils\\validation.py:1339: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " y = column_or_1d(y, warn=True)\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\utils\\validation.py:1339: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " y = column_or_1d(y, warn=True)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Random Forest: Mean Score = 0.9999449765688064, Standard Deviation = 0.00010860474979394001\n", + "Linear Regression: Mean Score = -5.286122247142867e+21, Standard Deviation = 9.978968848315854e+21\n", + "Gradient Boosting: Mean Score = 0.9999999992916644, Standard Deviation = 2.7301021406313204e-12\n", + "Support Vector Regression: Mean Score = 0.6826855358064324, Standard Deviation = 0.020395315184745886\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.ensemble import GradientBoostingRegressor\n", + "from sklearn.svm import SVR\n", + "from sklearn.model_selection import cross_val_score\n", + "\n", + "def train_multiple_models(X, y, models):\n", + " results = {}\n", + " for model_name, model in models.items():\n", + " # Создаем конвейер для каждой модели\n", + " model_pipeline = Pipeline(\n", + " [\n", + " (\"features_preprocessing\", features_preprocessing),\n", + " (\"drop_columns\", drop_columns),\n", + " (\"model\", model) # Используем текущую модель\n", + " ]\n", + " )\n", + " \n", + " # Обучаем модель и вычисляем кросс-валидацию\n", + " scores = cross_val_score(model_pipeline, X, y, cv=5) # 5-кратная кросс-валидация\n", + " results[model_name] = {\n", + " \"mean_score\": scores.mean(),\n", + " \"std_dev\": scores.std()\n", + " }\n", + " \n", + " return results\n", + "\n", + "models = {\n", + " \"Random Forest\": RandomForestRegressor(),\n", + " \"Linear Regression\": LinearRegression(),\n", + " \"Gradient Boosting\": GradientBoostingRegressor(),\n", + " \"Support Vector Regression\": SVR()\n", + "}\n", + "\n", + "results = train_multiple_models(X_train, y_train, models)\n", + "\n", + "# Вывод результатов\n", + "for model_name, scores in results.items():\n", + " print(f\"{model_name}: Mean Score = {scores['mean_score']}, Standard Deviation = {scores['std_dev']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Random Forest показала очень высокое среднее значение, близкое к 1, что указывает на ее высокую точность в предсказании. Стандартное отклонение также очень низкое, что говорит о стабильности модели.\n", + "- Линейная регрессия показала очень низкое среднее значение с огромным отрицательным числом, что указывает на ее неэффективность в данной задаче. Стандартное отклонение также очень высокое, что говорит о нестабильности модели.\n", + "- Gradient Boosting показала практически идеальное среднее значение, близкое к 1, что указывает на ее высокую точность в предсказании. Стандартное отклонение практически равно нулю, что говорит о чрезвычайной стабильности модели.\n", + "- Support Vector Regression показала среднее значение около 0.68, что указывает на ее умеренную точность в предсказании. Стандартное отклонение относительно низкое, что говорит о стабильности модели, но она все же уступает Random Forest и Gradient Boosting." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Обучение моделей на обучающем наборе данных и оценка на тестовом для регрессии" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: logistic\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MSE (train): 0.0125\n", + "MSE (test): 0.04038461538461539\n", + "MAE (train): 0.0125\n", + "MAE (test): 0.04038461538461539\n", + "R2 (train): 0.9275415718173158\n", + "R2 (test): 0.7776148582600195\n", + "STD (train): 0.11110243021644485\n", + "STD (test): 0.19685959012669935\n", + "----------------------------------------\n", + "Model: ridge\n", + "MSE (train): 0.004326923076923077\n", + "MSE (test): 0.013461538461538462\n", + "MAE (train): 0.004326923076923077\n", + "MAE (test): 0.013461538461538462\n", + "R2 (train): 0.9749182363983017\n", + "R2 (test): 0.9258716194200065\n", + "STD (train): 0.0656368860749005\n", + "STD (test): 0.11588034534756023\n", + "----------------------------------------\n", + "Model: decision_tree\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MSE (train): 0.0\n", + "MSE (test): 0.0\n", + "MAE (train): 0.0\n", + "MAE (test): 0.0\n", + "R2 (train): 1.0\n", + "R2 (test): 1.0\n", + "STD (train): 0.0\n", + "STD (test): 0.0\n", + "----------------------------------------\n", + "Model: knn\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MSE (train): 0.09278846153846154\n", + "MSE (test): 0.15384615384615385\n", + "MAE (train): 0.09278846153846154\n", + "MAE (test): 0.15384615384615385\n", + "R2 (train): 0.4621355138746903\n", + "R2 (test): 0.1528185076572175\n", + "STD (train): 0.29276240884468824\n", + "STD (test): 0.3684085396282311\n", + "----------------------------------------\n", + "Model: naive_bayes\n", + "MSE (train): 0.37740384615384615\n", + "MSE (test): 0.6096153846153847\n", + "MAE (train): 0.37740384615384615\n", + "MAE (test): 0.6096153846153847\n", + "R2 (train): -1.1876871585925808\n", + "R2 (test): -2.3569566634082757\n", + "STD (train): 0.4847372309428379\n", + "STD (test): 0.5672229402142737\n", + "----------------------------------------\n", + "Model: gradient_boosting\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MSE (train): 0.0\n", + "MSE (test): 0.0\n", + "MAE (train): 0.0\n", + "MAE (test): 0.0\n", + "R2 (train): 1.0\n", + "R2 (test): 1.0\n", + "STD (train): 0.0\n", + "STD (test): 0.0\n", + "----------------------------------------\n", + "Model: random_forest\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MSE (train): 0.0\n", + "MSE (test): 0.0\n", + "MAE (train): 0.0\n", + "MAE (test): 0.0\n", + "R2 (train): 1.0\n", + "R2 (test): 1.0\n", + "STD (train): 0.0\n", + "STD (test): 0.0\n", + "----------------------------------------\n", + "Model: mlp\n", + "MSE (train): 0.06778846153846153\n", + "MSE (test): 0.12692307692307692\n", + "MAE (train): 0.06778846153846153\n", + "MAE (test): 0.12692307692307692\n", + "R2 (train): 0.6070523702400588\n", + "R2 (test): 0.30107526881720437\n", + "STD (train): 0.2521427220700598\n", + "STD (test): 0.3370600353877945\n", + "----------------------------------------\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn import metrics\n", + "from sklearn.pipeline import Pipeline\n", + "\n", + "# Проверка наличия необходимых переменных\n", + "if 'class_models' not in locals():\n", + " raise ValueError(\"class_models is not defined\")\n", + "if 'X_train' not in locals() or 'X_test' not in locals() or 'y_train' not in locals() or 'y_test' not in locals():\n", + " raise ValueError(\"Train/test data is not defined\")\n", + "\n", + "\n", + "y_train = np.ravel(y_train) \n", + "y_test = np.ravel(y_test) \n", + "\n", + "# Инициализация списка для хранения результатов\n", + "results = []\n", + "\n", + "# Проход по моделям и оценка их качества\n", + "for model_name in class_models.keys():\n", + " print(f\"Model: {model_name}\")\n", + " \n", + " # Извлечение модели из словаря\n", + " model = class_models[model_name][\"model\"]\n", + " \n", + " # Создание пайплайна\n", + " model_pipeline = Pipeline([(\"pipeline\", pipeline_end), (\"model\", model)])\n", + " \n", + " # Обучение модели\n", + " model_pipeline.fit(X_train, y_train)\n", + "\n", + " # Предсказание для обучающей и тестовой выборки\n", + " y_train_predict = model_pipeline.predict(X_train)\n", + " y_test_predict = model_pipeline.predict(X_test)\n", + "\n", + " # Сохранение пайплайна и предсказаний\n", + " class_models[model_name][\"pipeline\"] = model_pipeline\n", + " class_models[model_name][\"preds\"] = y_test_predict\n", + "\n", + " # Вычисление метрик для регрессии\n", + " class_models[model_name][\"MSE_train\"] = metrics.mean_squared_error(y_train, y_train_predict)\n", + " class_models[model_name][\"MSE_test\"] = metrics.mean_squared_error(y_test, y_test_predict)\n", + " class_models[model_name][\"MAE_train\"] = metrics.mean_absolute_error(y_train, y_train_predict)\n", + " class_models[model_name][\"MAE_test\"] = metrics.mean_absolute_error(y_test, y_test_predict)\n", + " class_models[model_name][\"R2_train\"] = metrics.r2_score(y_train, y_train_predict)\n", + " class_models[model_name][\"R2_test\"] = metrics.r2_score(y_test, y_test_predict)\n", + "\n", + " # Дополнительные метрики\n", + " class_models[model_name][\"STD_train\"] = np.std(y_train - y_train_predict)\n", + " class_models[model_name][\"STD_test\"] = np.std(y_test - y_test_predict)\n", + "\n", + " # Вывод результатов для текущей модели\n", + " print(f\"MSE (train): {class_models[model_name]['MSE_train']}\")\n", + " print(f\"MSE (test): {class_models[model_name]['MSE_test']}\")\n", + " print(f\"MAE (train): {class_models[model_name]['MAE_train']}\")\n", + " print(f\"MAE (test): {class_models[model_name]['MAE_test']}\")\n", + " print(f\"R2 (train): {class_models[model_name]['R2_train']}\")\n", + " print(f\"R2 (test): {class_models[model_name]['R2_test']}\")\n", + " print(f\"STD (train): {class_models[model_name]['STD_train']}\")\n", + " print(f\"STD (test): {class_models[model_name]['STD_test']}\")\n", + " print(\"-\" * 40) # Разделитель для разных моделей" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Пример использования обученной модели (конвейера регрессии) для предсказания" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: RandomForest\n", + "MSE (train): 24.028673442957558\n", + "MSE (test): 68.96006650623248\n", + "MAE (train): 1.548185999451937\n", + "MAE (test): 3.372747412240537\n", + "R2 (train): 0.8231149198653249\n", + "R2 (test): -1.9013866015383956\n", + "----------------------------------------\n", + "Прогнозируемое чистое состояние: 1.3689999999999998\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n", + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1, 2] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn import metrics\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.ensemble import RandomForestRegressor \n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "\n", + "# 1. Загрузка данных\n", + "data = pd.read_csv(\"..//static//csv//Forbes Billionaires.csv\") \n", + "\n", + "# 2. Подготовка данных для прогноза\n", + "average_networth = data['Networth'].mean()\n", + "data['above_average_networth'] = (data['Networth'] > average_networth).astype(int) \n", + "\n", + "# Предикторы и целевая переменная\n", + "X = data.drop('Networth', axis=1) \n", + "y = data['Networth']\n", + "\n", + "# 3. Инициализация модели и пайплайна\n", + "class_models = {\n", + " \"RandomForest\": {\n", + " \"model\": RandomForestRegressor(n_estimators=100, random_state=42),\n", + " }\n", + "}\n", + "\n", + "# Предобработка признаков\n", + "num_columns = ['Age']\n", + "cat_columns = ['Country', 'Source', 'Industry']\n", + "\n", + "# Преобразование числовых признаков\n", + "num_transformer = Pipeline(steps=[\n", + " ('imputer', SimpleImputer(strategy='median')),\n", + " ('scaler', StandardScaler())\n", + "])\n", + "\n", + "# Преобразование категориальных признаков\n", + "cat_transformer = Pipeline(steps=[\n", + " ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),\n", + " ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop=\"first\"))\n", + "])\n", + "\n", + "# Создание конвейера предобработки\n", + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " ('num', num_transformer, num_columns),\n", + " ('cat', cat_transformer, cat_columns)\n", + " ])\n", + "\n", + "# Создание конвейера модели\n", + "pipeline_end = Pipeline(steps=[\n", + " ('preprocessor', preprocessor),\n", + " # ('model', model) # Модель добавляется в цикле\n", + "])\n", + "\n", + "results = []\n", + "\n", + "# 4. Обучение модели и оценка\n", + "for model_name in class_models.keys():\n", + " print(f\"Model: {model_name}\")\n", + "\n", + " model = class_models[model_name][\"model\"]\n", + " model_pipeline = Pipeline(steps=[\n", + " ('preprocessor', preprocessor),\n", + " ('model', model)\n", + " ])\n", + "\n", + " # Разделение данных\n", + " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "\n", + " # Обучение модели\n", + " model_pipeline.fit(X_train, y_train)\n", + "\n", + " # Предсказание\n", + " y_train_predict = model_pipeline.predict(X_train)\n", + " y_test_predict = model_pipeline.predict(X_test)\n", + "\n", + " # Сохранение результатов\n", + " class_models[model_name][\"preds\"] = y_test_predict\n", + "\n", + " # Вычисление метрик\n", + " class_models[model_name][\"MSE_train\"] = metrics.mean_squared_error(y_train, y_train_predict)\n", + " class_models[model_name][\"MSE_test\"] = metrics.mean_squared_error(y_test, y_test_predict)\n", + " class_models[model_name][\"MAE_train\"] = metrics.mean_absolute_error(y_train, y_train_predict)\n", + " class_models[model_name][\"MAE_test\"] = metrics.mean_absolute_error(y_test, y_test_predict)\n", + " class_models[model_name][\"R2_train\"] = metrics.r2_score(y_train, y_train_predict)\n", + " class_models[model_name][\"R2_test\"] = metrics.r2_score(y_test, y_test_predict)\n", + "\n", + " # Вывод результатов\n", + " print(f\"MSE (train): {class_models[model_name]['MSE_train']}\")\n", + " print(f\"MSE (test): {class_models[model_name]['MSE_test']}\")\n", + " print(f\"MAE (train): {class_models[model_name]['MAE_train']}\")\n", + " print(f\"MAE (test): {class_models[model_name]['MAE_test']}\")\n", + " print(f\"R2 (train): {class_models[model_name]['R2_train']}\")\n", + " print(f\"R2 (test): {class_models[model_name]['R2_test']}\")\n", + " print(\"-\" * 40)\n", + "\n", + "# Прогнозирование чистого состояния для нового миллиардера\n", + "new_billionaire_data = pd.DataFrame({\n", + " 'Age': [50],\n", + " 'Country': ['USA'],\n", + " 'Source': ['Self Made'], \n", + " 'Industry': ['Technology'], \n", + "})\n", + "\n", + "predicted_networth = model_pipeline.predict(new_billionaire_data)\n", + "print(f\"Прогнозируемое чистое состояние: {predicted_networth[0]}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Подбор гиперпараметров методом поиска по сетке" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 3 folds for each of 36 candidates, totalling 108 fits\n", + "Лучшие параметры: {'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 100}\n", + "Лучший результат (MSE): 5.88542132388105\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn import metrics\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.model_selection import train_test_split, GridSearchCV\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.impute import SimpleImputer\n", + "\n", + "# Удаление строк с пропущенными значениями (если необходимо)\n", + "df = df.dropna()\n", + "\n", + "# Создание целевой переменной (Networth)\n", + "target = df['Networth']\n", + "\n", + "# Удаление целевой переменной из исходных данных\n", + "features = df.drop(columns=['Networth'])\n", + "\n", + "# Удаление столбцов, которые не будут использоваться (например, имена)\n", + "features = features.drop(columns=['Name'])\n", + "\n", + "# Определение столбцов для обработки\n", + "num_columns = features.select_dtypes(include=['number']).columns\n", + "cat_columns = features.select_dtypes(include=['object']).columns\n", + "\n", + "# Препроцессинг числовых столбцов\n", + "num_imputer = SimpleImputer(strategy=\"median\") # Используем медиану для заполнения пропущенных значений в числовых столбцах\n", + "num_scaler = StandardScaler()\n", + "preprocessing_num = Pipeline(\n", + " [\n", + " (\"imputer\", num_imputer),\n", + " (\"scaler\", num_scaler),\n", + " ]\n", + ")\n", + "\n", + "# Препроцессинг категориальных столбцов\n", + "cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\") # Используем 'unknown' для заполнения пропущенных значений в категориальных столбцах\n", + "cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n", + "preprocessing_cat = Pipeline(\n", + " [\n", + " (\"imputer\", cat_imputer),\n", + " (\"encoder\", cat_encoder),\n", + " ]\n", + ")\n", + "\n", + "# Объединение препроцессинга\n", + "features_preprocessing = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"preprocessing_num\", preprocessing_num, num_columns),\n", + " (\"preprocessing_cat\", preprocessing_cat, cat_columns),\n", + " ],\n", + " remainder=\"passthrough\"\n", + ")\n", + "\n", + "# Создание финального пайплайна\n", + "pipeline_end = Pipeline(\n", + " [\n", + " (\"features_preprocessing\", features_preprocessing),\n", + " ]\n", + ")\n", + "\n", + "# Разделение данных на обучающую и тестовую выборки\n", + "X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)\n", + "\n", + "# Применение пайплайна к данным\n", + "X_train_processed = pipeline_end.fit_transform(X_train)\n", + "X_test_processed = pipeline_end.transform(X_test)\n", + "\n", + "# 2. Создание и настройка модели случайного леса\n", + "model = RandomForestRegressor()\n", + "\n", + "# Установка параметров для поиска по сетке\n", + "param_grid = {\n", + " 'n_estimators': [50, 100, 200], # Количество деревьев\n", + " 'max_depth': [None, 10, 20, 30], # Максимальная глубина дерева\n", + " 'min_samples_split': [2, 5, 10] # Минимальное количество образцов для разбиения узла\n", + "}\n", + "\n", + "# 3. Подбор гиперпараметров с помощью Grid Search\n", + "grid_search = GridSearchCV(estimator=model, param_grid=param_grid,\n", + " scoring='neg_mean_squared_error', cv=3, n_jobs=-1, verbose=2)\n", + "\n", + "# Обучение модели на тренировочных данных\n", + "grid_search.fit(X_train_processed, y_train)\n", + "\n", + "# 4. Результаты подбора гиперпараметров\n", + "print(\"Лучшие параметры:\", grid_search.best_params_)\n", + "print(\"Лучший результат (MSE):\", -grid_search.best_score_) # Меняем знак, так как берем отрицательное значение среднеквадратичной ошибки" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Обучение модели с новыми гиперпараметрами и сравнение новых и старых данных" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Admin\\Desktop\\5 semestr\\mii\\AIM-PIbd-32-Safiulova-K-N\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 3 folds for each of 36 candidates, totalling 108 fits\n", + "Старые параметры: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 50}\n", + "Лучший результат (MSE) на старых параметрах: 5.760387482085847\n", + "\n", + "Новые параметры: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}\n", + "Лучший результат (MSE) на новых параметрах: 13.643983185514095\n", + "Среднеквадратическая ошибка (MSE) на тестовых данных: 0.024952019817877404\n", + "Корень среднеквадратичной ошибки (RMSE) на тестовых данных: 0.15796208348169316\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA0kAAAHWCAYAAACi1sL/AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABj9klEQVR4nO3dd3gU1fv38c+GhPRGCRBK6DWIFAuggKIERKqCEJAqKiJFioAKISBGRIoKoqAColQFVCx8ld4ldGlSQpFeE5KQBJJ5/uDJ/nZJArthUwjv13XtRebMmTP3DJNM7pwzZ0yGYRgCAAAAAEiSnHI6AAAAAADITUiSAAAAAMACSRIAAAAAWCBJAgAAAAALJEkAAAAAYIEkCQAAAAAskCQBAAAAgAWSJAAAAACwQJIEAAAAABZIkgAAAADAAkkScJ84cuSIXnvtNZUtW1Zubm7y8fFR/fr19cknn+j69es5Hd4DY/Xq1TKZTDKZTPruu+/SrVO/fn2ZTCYFBwdblSclJemTTz5RzZo15ePjIz8/P1WrVk2vvvqqDhw4YK43a9Ys8z7S+2zevDlLjxEAgAedc04HAODufv31V7Vr106urq7q0qWLgoODlZSUpPXr12vIkCHau3evpk+fntNhPlDc3Nw0d+5cde7c2ar82LFj2rhxo9zc3NJs88ILL+j3339Xx44d1atXL924cUMHDhzQsmXLVK9ePVWuXNmq/ujRo1WmTJk07ZQvX96xBwMAAKyQJAG5XFRUlDp06KCgoCCtXLlSxYoVM6/r06ePDh8+rF9//TUHI3wwPffcc/r555918eJFFSpUyFw+d+5cFSlSRBUqVNCVK1fM5Vu3btWyZcs0duxYvfPOO1ZtTZkyRVevXk2zj2bNmqlOnTpZdgwAACB9DLcDcrmPPvpIsbGx+vrrr60SpFTly5dX//79zcsmk0lvvvmmvv/+e1WqVElubm6qXbu21q5da7Xd8ePH9cYbb6hSpUpyd3dXwYIF1a5dOx07dsyq3u1Dvzw8PFS9enV99dVXVvW6desmLy+vNPH98MMPMplMWr16tVX5li1b1LRpU/n6+srDw0MNGzbUhg0brOqMGjVKJpNJFy9etCqPjIyUyWTSrFmzrPZfunRpq3onT56Uu7u7TCZTmuP6/fff9eSTT8rT01Pe3t5q3ry59u7dmyb+jLRq1Uqurq5atGiRVfncuXPVvn175cuXz6r8yJEjkm4Nxbtdvnz5VLBgQZv3bYtjx45lOFzv9nMhSY0aNUq3ruU5lqRp06YpODhYHh4eVvV++OGHu8Z06tQp9ezZU4GBgXJ1dVWZMmXUu3dvJSUl3XWIoWUsu3fvVrdu3cxDT4sWLaoePXro0qVLVvtLvX4OHDig9u3by8fHRwULFlT//v2VkJBgVTf1+yYjqfGlnruVK1fKyclJI0eOtKo3d+5cmUwmTZs27Y7nolGjRmrUqJFV2datW83HejeNGjVKM5xTkj7++ON0/48///xzVatWTa6urgoMDFSfPn3SJOa3XwOFChVS8+bN9c8//1jVy4lzdafrwvJYf/rpJzVv3tx8jZUrV05jxoxRcnJymjaDg4O1bds21atXT+7u7ipTpoy++OILq3pJSUkaOXKkateuLV9fX3l6eurJJ5/UqlWrrOpZfr8tXbrUal1CQoL8/f1lMpn08ccfW607deqUevTooSJFisjV1VXVqlXTN998Y15vObw3o8+oUaMk2Xe937x5U2PGjFG5cuXk6uqq0qVL65133lFiYqJVvdKlS5v34+TkpKJFi+qll17SiRMn7vh/BuQF9CQBudwvv/yismXLql69ejZvs2bNGi1YsED9+vWTq6urPv/8czVt2lR///23+RerrVu3auPGjerQoYNKlCihY8eOadq0aWrUqJH27dsnDw8PqzYnTZqkQoUKKSYmRt9884169eql0qVL65lnnrH7mFauXKlmzZqpdu3aCgsLk5OTk2bOnKmnn35a69at06OPPmp3m+kZOXJkml8OJGnOnDnq2rWrQkJCNG7cOMXHx2vatGl64okntGPHjjTJVno8PDzUqlUrzZs3T71795Yk7dq1S3v37tVXX32l3bt3W9UPCgqSJH3//feqX7++nJ3v/uM3Ojo6TYJoMpnsSqg6duyo5557TpL022+/ad68eRnWrVy5st59911J0sWLF/XWW29ZrV+wYIHeeOMNNWrUSH379pWnp6f279+vDz744K5xnD59Wo8++qiuXr2qV199VZUrV9apU6f0ww8/KD4+Xg0aNNCcOXPM9ceOHStJ5ngkmb8H/vzzTx09elTdu3dX0aJFzcNN9+7dq82bN6dJMtq3b6/SpUsrIiJCmzdv1qeffqorV67o22+/vWvcGXn66af1xhtvKCIiQq1bt1atWrV05swZ9e3bV88884xef/11u9scOnRopuO5k1GjRik8PFzPPPOMevfurYMHD2ratGnaunWrNmzYIBcXF3Pd1GvAMAwdOXJEEydO1HPPPXdPvxQ74lyVKFFCERERVmXpXc+zZs2Sl5eXBg4cKC8vL61cuVIjR45UTEyMxo8fb1X3ypUreu6559S+fXt17NhRCxcuVO/evZU/f3716NFDkhQTE6OvvvrKPET22rVr+vrrrxUSEqK///5bDz/8sFWbbm5umjlzplq3bm0uW7x4cbo/h86dO6fHH3/cnHQWLlxYv//+u3r27KmYmBgNGDBAVapUsfq+mD59uvbv369JkyaZyx566CGrdm253l955RXNnj1bL774ogYNGqQtW7YoIiJC+/fv15IlS6zae/LJJ/Xqq68qJSVF//zzjyZPnqzTp09r3bp1aY4JyFMMALlWdHS0Iclo1aqVzdtIMiQZkZGR5rLjx48bbm5uRps2bcxl8fHxabbdtGmTIcn49ttvzWUzZ840JBlRUVHmsn///deQZHz00Ufmsq5duxqenp5p2ly0aJEhyVi1apVhGIaRkpJiVKhQwQgJCTFSUlKs4ilTpozx7LPPmsvCwsIMScaFCxes2ty6dashyZg5c6bV/oOCgszL//zzj+Hk5GQ0a9bMKv5r164Zfn5+Rq9evazaPHv2rOHr65um/HarVq0yJBmLFi0yli1bZphMJuPEiROGYRjGkCFDjLJlyxqGYRgNGzY0qlWrZt4uJSXFaNiwoSHJKFKkiNGxY0dj6tSpxvHjx9PsI/Wcp/dxdXW9Y3ypUv+PPv74Y3PZ+PHj0/xfpqpfv77x1FNPmZejoqLSnOOOHTsafn5+xvXr19M9H3fSpUsXw8nJydi6dWuadZbXQaqGDRsaDRs2TLet9K7defPmGZKMtWvXmstSr5+WLVta1X3jjTcMScauXbvMZZKMPn36ZBh/et8HcXFxRvny5Y1q1aoZCQkJRvPmzQ0fH590/0/vdny//fabIclo2rSpYcut+fbrK9Xt/8fnz5838ufPbzRp0sRITk4215syZYohyfjmm28yjMkwDOOdd94xJBnnz583l+XEubLlWA0j/WvjtddeMzw8PIyEhASrNiUZEyZMMJclJiYaDz/8sBEQEGAkJSUZhmEYN2/eNBITE63au3LlilGkSBGjR48e5rLU75eOHTsazs7OxtmzZ83rGjdubISGhhqSjPHjx5vLe/bsaRQrVsy4ePGiVfsdOnQwfH190z2W23/OWbL1et+5c6chyXjllVes6g0ePNiQZKxcudJcFhQUZHTt2tWqXmhoqOHh4ZFuDEBewnA7IBeLiYmRJHl7e9u1Xd26dVW7dm3zcqlSpdSqVSstX77cPOzE3d3dvP7GjRu6dOmSypcvLz8/P23fvj1Nm1euXNHFixd19OhRTZo0Sfny5VPDhg3T1Lt48aLV59q1a1brd+7cqUOHDik0NFSXLl0y14uLi1Pjxo21du1apaSkWG1z+fJlqzajo6Pveg6GDx+uWrVqqV27dlblf/75p65evaqOHTtatZkvXz499thjaYbR3EmTJk1UoEABzZ8/X4ZhaP78+erYsWO6dU0mk5YvX673339f/v7+mjdvnvr06aOgoCC99NJL6T6TNHXqVP35559Wn99//92m2FL/cp3eBBLpSUpKkqur6x3rXLt2TR4eHja3mSolJUVLly5VixYt0n3GypbhZZYsr92EhARdvHhRjz/+uCSle+326dPHarlv376SbvVEWEpt69KlS2muwfR4eHho1qxZ2r9/vxo0aKBff/1VkyZNUqlSpew6HsMwNHz4cL3wwgt67LHHbN4uOTk5zfdbfHy8VZ2//vpLSUlJGjBggJyc/u+W36tXL/n4+KR5nvHGjRu6ePGiLly4oE2bNmnJkiV66KGHrJ67k3LuXN2N5bVx7do1Xbx4UU8++aTi4+OtZpCUJGdnZ7322mvm5fz58+u1117T+fPntW3bNkm3hsLmz59f0q3r+PLly7p586bq1KmT7rVWq1YtVatWzdz7c/z4ca1atUrdunWzqmcYhn788Ue1aNFChmFY/R+GhIQoOjo63fZtcbfrPfXfgQMHWtUbNGiQJKW5JhITE3Xx4kWdP39ef/75p1auXKnGjRtnKjbgfsJwOyAX8/HxkaQ0icbdVKhQIU1ZxYoVFR8frwsXLqho0aK6fv26IiIiNHPmTJ06dUqGYZjrppeE1KpVy/y1q6urpkyZkmZYXFxcnAoXLnzH2A4dOiRJ6tq1a4Z1oqOj5e/vb16uVKnSHdu83fr16/XLL79oxYoVaYYJpe7/6aefTnfb1HNuCxcXF7Vr105z587Vo48+qpMnTyo0NDTD+q6urnr33Xf17rvv6syZM1qzZo0++eQTLVy4UC4uLmmmFH/00UczPXFD6jA9X19fm+pfvXrVPCQwI3Xr1tWyZcs0atQo9ejRQx4eHjYlrBcuXFBMTEy6z9BkxuXLlxUeHq758+fr/PnzVuvSi+f274dy5crJyckpzXM7X3/9tb7++mtJt35hfuyxxzRx4sQ7/h/Ur19fvXv31tSpUxUSEmIepmWP77//Xnv37tXChQs1d+5cm7c7cODAXb/fjh8/Lint91D+/PlVtmxZ8/pUGzdutGqzQoUKWrp0aZpENqfO1d3s3btX7733nlauXGn+I1Oq26+NwMBAeXp6WpVVrFhR0q1njFIT79mzZ2vChAk6cOCAbty4Ya6b3syTktS9e3dNnz5dgwcP1qxZs1SvXr001+CFCxd09epVTZ8+PcOZSW+/tm11t+v9+PHjcnJySjNLZtGiReXn55fmmpg/f77mz59vXn7kkUfSPJMK5EUkSUAu5uPjo8DAwDQPTjtC3759NXPmTA0YMEB169aVr6+vTCaTOnTokO5fhr/77jsVKVJECQkJWrlypfr06SM3Nzerv5C6ubnpl19+sdpu3bp1Gj16tHk5te3x48enGc+f6vYJIH788Uer5OXff/9N89dSS0OHDlVISIiefvrpNBMPpO5/zpw5Klq0aJptbXlWyFJoaKi++OILjRo1SjVq1FDVqlVt2q5YsWLq0KGDXnjhBVWrVk0LFy7UrFmz7N5/RlJ/IbLl+SpJOnv2rEJCQu5Y56233tLBgwc1ZswYhYeH32OEmde+fXtt3LhRQ4YM0cMPPywvLy+lpKSoadOmNvVqZNRz1apVK7355psyDENRUVEaPXq0nn/+eXNinZ7ExETzpCRHjhxRfHx8muf57iQpKUkjRoxQz549zb+g26p06dKaMWOGVdmiRYvu6XUADz30kCZMmCDp1i/yn376qRo1aqTt27dbfb/kxLm6m6tXr6phw4by8fHR6NGjVa5cObm5uWn79u0aOnSoTdfG7b777jt169ZNrVu31pAhQxQQEKB8+fIpIiLCPBnL7Tp37qy3335bmzdv1uzZs/Xee++lqZMaS+fOnTP8g9HtzxplVkbXu609uE2aNNGQIUMkSf/995/GjRunp556SpGRkVY9d0BeQ5IE5HLPP/+8pk+frk2bNqlu3bo2bZPeLyr//vuvPDw8zH8l/uGHH9S1a1fzL0TSrSE06Q37km79FTj1F+7nn39ee/fuVUREhFWSlC9fvjQTOdzeXrly5STdSgBtnfShQYMGVsN9/Pz8Mqy7dOlSbdq0KcOhKqn7DwgIyNSkE7d74oknVKpUKa1evVrjxo2ze3sXFxc99NBDOnTokC5evJhu4pYZkZGRcnZ2zjARtfTff//p2rVrqlKlyh3rubu7a8aMGdqxY4d8fX0VFhamXbt2afDgwXfcrnDhwvLx8XFIsn/lyhWtWLFC4eHhVrOl3emX80OHDln91f/w4cNKSUlJk0CWKFHC6prw8vJSp06dtGPHjgzbDgsL0/79+/Xxxx9r6NChGjZsmD799FObj+fzzz/X+fPnzTOU2cPT0zPNNbxz506r5dTewYMHD6ps2bLm8qSkJEVFRaXZ3t/f36qsUaNGCgwM1MyZMzV8+HBzeU6cq7tZvXq1Ll26pMWLF6tBgwbm8qioqHTrnz59WnFxcVa9Sf/++6+k//vjwg8//KCyZctq8eLFVklFWFhYhnEULFhQLVu2NA/da9++fZoJWAoXLixvb28lJyc75OeQpbtd70FBQUpJSdGhQ4esvufPnTuXbo9ysWLFrGKsVKmS6tWrp6VLl2Y4vBjIC3gmCcjl3n77bXl6euqVV17RuXPn0qw/cuSIPvnkE6uy25OEkydP6qefflKTJk3MU1Pny5fPaoidJH322WdppsrNyPXr19NMF2uL2rVrq1y5cvr4448VGxubZv2FCxfsbjNVcnKy3nnnHYWGhmaYHISEhMjHx0cffPCB1dCZzO7fZDLp008/VVhYmF5++eUM6x06dCjdGcKuXr2qTZs2yd/f/65Dp2yVlJSkn3/+WU8//XS607LfLnUoTUZDEC0NHz5cJ06c0HfffadnnnnG6tm3jDg5Oal169b65ZdfFBkZmWb97dfhnaRev7dvM3ny5Ay3mTp1qtXyZ599JunWe6juJPWv/bdP555qy5Yt+vjjjzVgwAANGjRIQ4YM0ZQpU7RmzZo7tpvq2rVrGjt2rN566y2HJce3e+aZZ5Q/f359+umnVufs66+/VnR0tJo3b37H7a9fvy5Jd/1ez+pzZYv0ro2kpCR9/vnn6da/efOmvvzyS6u6X375pQoXLmy+rtNrc8uWLdq0adMdY+nRo4d2796tdu3apfs9mC9fPr3wwgv68ccf0/3jwb38HLzb9Z462+Xt3zMTJ06UJIddE8D9jp4kIJcrV66c5s6dq5deeklVqlRRly5dFBwcrKSkJG3cuFGLFi1K81BwcHCwQkJCrKYAl2Q1ROr555/XnDlz5Ovrq6pVq2rTpk3666+/MpxeeunSpSpUqJB5uN26des0YMAAu4/HyclJX331lZo1a6Zq1aqpe/fuKl68uE6dOqVVq1bJx8cnzZA9W/3333/Knz9/mgfyLfn4+GjatGl6+eWXVatWLXXo0EGFCxfWiRMn9Ouvv6p+/fqaMmWKXftt1aqVWrVqdcc6u3btUmhoqJo1a6Ynn3xSBQoU0KlTpzR79mydPn1akydPTvML5u+//57mYXPp1lTYlr0Clnbv3q3w8HD9999/at68udVzTqmJs+VfgMPCwvTVV1+pQ4cOqly58h2P4a+//tKkSZM0Z86cuz6/dLsPPvhA//vf/9SwYUO9+uqrqlKlis6cOaNFixZp/fr1d+wdtOTj46MGDRroo48+0o0bN1S8eHH973//y7C3QLrVk9CyZUs1bdpUmzZt0nfffafQ0FDVqFHDqt6JEyf0xx9/mIeQjR07VkFBQapZs2aanqqEhAR17dpVFSpUME9XHh4erl9++UXdu3fXnj170jzvcrvt27erUKFCevvtt2069swoXLiwhg8frvDwcDVt2lQtW7bUwYMH9fnnn+uRRx5R586dreqfO3fOfM1cvHhRX375pZydnfX8889b1cvuc2WLevXqyd/fX127dlW/fv1kMpk0Z86cDJPwwMBAjRs3TseOHVPFihW1YMEC7dy5U9OnTzdPi/78889r8eLFatOmjZo3b66oqCh98cUXqlq1arp/5EnVtGlTXbhw4Y5/pPjwww+1atUqPfbYY+rVq5eqVq2qy5cva/v27frrr790+fLlTJ2Hu13vNWrUUNeuXTV9+nTzEMW///5bs2fPVuvWrfXUU09ZtXf06FHzNXHq1ClNmTJFPj4+TN6AvC/7J9QDkBn//vuv0atXL6N06dJG/vz5DW9vb6N+/frGZ599ZjW1rf7/9LzfffedUaFCBcPV1dWoWbOmeQruVFeuXDG6d+9uFCpUyPDy8jJCQkKMAwcOpJny9fbpqPPnz2+UL1/eGDlypNV+bZ0CPNWOHTuMtm3bGgULFjRcXV2NoKAgo3379saKFSvMdeydAlyS0b9/f6u66U1HbBi3pq4OCQkxfH19DTc3N6NcuXJGt27drKZOT4+tU17fPm3xuXPnjA8//NBo2LChUaxYMcPZ2dnw9/c3nn76aeOHH35IN+aMPpbHfbvUc3a3z6pVq4wNGzYY5cuXN0aNGpVmmuPbpwC/ePGiERgYaHTs2DFT58Mwbk1F36VLF6Nw4cKGq6urUbZsWaNPnz5p9p16/jKaAvy///4z2rRpY/j5+Rm+vr5Gu3btjNOnTxuSjLCwsDTnYt++fcaLL75oeHt7G/7+/sabb75pNY25YRhW58ZkMhlFixY12rZta+zfv98wjLTX0VtvvWXky5fP2LJli1U7kZGRhrOzs9G7d+87novUKagnTZpkVZ4a893YMy22Ydya8rty5cqGi4uLUaRIEaN3797GlStX0o0p9ePn52fUr1/f+O2336zq5cS5svVYN2zYYDz++OOGu7u7ERgYaLz99tvG8uXL0/wMSm0zMjLSqFu3ruHm5mYEBQUZU6ZMsdpHSkqK8cEHHxhBQUHmn6XLli1LMxV36veL5RTfljJaf+7cOaNPnz5GyZIlDRcXF6No0aJG48aNjenTp6fbji1TgNtyvd+4ccMIDw83ypQpY7i4uBglS5Y0hg8fbvUz3TBuTQFu+f9dqFAho0mTJsamTZvSjQHIS0yGYcc4BwC5nslkUp8+fezuDUHeMGrUKK1evdr8gHx6SpcurVmzZqlRo0bZFldOSH2J6oULF9JMYY0HW6NGjXTx4sUsmRQnp3C9A47FM0kAAAAAYIFnkgAgD3nooYfMz1NkpE2bNipSpEg2RQQAwP2HJAkA8pC2bdvetc6kSZOyIRIAAO5fPJMEAAAAABZ4JgkAAAAALJAkAQAAAICFPP9MUkpKik6fPi1vb2+ZTKacDgcAAABADjEMQ9euXVNgYKCcnDLuL8rzSdLp06dVsmTJnA4DAAAAQC5x8uRJlShRIsP1eT5J8vb2lnTrRPj4+ORwNAAAAABySkxMjEqWLGnOETKS55Ok1CF2Pj4+JEkAAAAA7voYDhM3AAAAAIAFkiQAAAAAsJCjSdLatWvVokULBQYGymQyaenSpRnWff3112UymTR58uRsiw8AAADAgydHk6S4uDjVqFFDU6dOvWO9JUuWaPPmzQoMDMymyAAAAAA8qHJ04oZmzZqpWbNmd6xz6tQp9e3bV8uXL1fz5s3v2mZiYqISExPNyzExMfccJwAAAIAHR65+JiklJUUvv/yyhgwZomrVqtm0TUREhHx9fc0f3pEEAAAAwB65OkkaN26cnJ2d1a9fP5u3GT58uKKjo82fkydPZmGEAAAAAPKaXPuepG3btumTTz7R9u3b7zqPuSVXV1e5urpmYWQAAAAA8rJc25O0bt06nT9/XqVKlZKzs7OcnZ11/PhxDRo0SKVLl87p8AAAAADkUbm2J+nll1/WM888Y1UWEhKil19+Wd27d8+hqAAAAADkdTmaJMXGxurw4cPm5aioKO3cuVMFChRQqVKlVLBgQav6Li4uKlq0qCpVqpTdoQIAAAB4QORokhQZGamnnnrKvDxw4EBJUteuXTVr1qwcigoAAADAgyxHk6RGjRrJMAyb6x87dizrggEAAAAA5eKJGwAAAAAgJ5AkAQAAAICFXDu7HQAAD5S5tr8TEADuO6G2P2KTG9CTBAAAAAAWSJIAAAAAwAJJEgAAAABYIEkCAAAAAAskSQAAAABggSQJAAAAACyQJAEAAACABZIkAAAAALBAkgQAAAAAFkiSAAAAAMACSRIAAAAAWCBJAgAAAAALJEkAAAAAYIEkCQAAAAAskCQBAAAAgAWSJAAAAACwQJIEAAAAABZIkgAAAADAAkkSAAAAAFggSQIAAAAACyRJAAAAAGCBJAkAAAAALJAkAQAAAIAFkiQAAAAAsECSBAAAAAAWSJIAAAAAwAJJEgAAAABYIEkCAAAAAAskSQAAAABggSQJAAAAACyQJAEAAACABZIkAAAAALBAkgQAAAAAFkiSAAAAAMACSRIAAAAAWCBJAgAAAAALzvZUvnr1qpYsWaJ169bp+PHjio+PV+HChVWzZk2FhISoXr16WRUnAAAAAGQLm3qSTp8+rVdeeUXFihXT+++/r+vXr+vhhx9W48aNVaJECa1atUrPPvusqlatqgULFmR1zAAAAACQZWzqSapZs6a6du2qbdu2qWrVqunWuX79upYuXarJkyfr5MmTGjx4sEMDBQAAAIDsYFNP0r59+/TRRx9lmCBJkru7uzp27KhNmzape/fuNu187dq1atGihQIDA2UymbR06VLzuhs3bmjo0KGqXr26PD09FRgYqC5duuj06dM2tQ0AAAAAmWFTklSwYEG7GrW1flxcnGrUqKGpU6emWRcfH6/t27drxIgR2r59uxYvXqyDBw+qZcuWdsUCAAAAAPaweXa7N954Q7GxseblefPmKS4uzrx89epVPffcc3btvFmzZnr//ffVpk2bNOt8fX31559/qn379qpUqZIef/xxTZkyRdu2bdOJEyfs2g8AAAAA2MrmJOnLL79UfHy8efm1117TuXPnzMuJiYlavny5Y6O7TXR0tEwmk/z8/DKsk5iYqJiYGKsPAAAAANjK5iTJMIw7Lme1hIQEDR06VB07dpSPj0+G9SIiIuTr62v+lCxZMhujBAAAAHC/uy9eJnvjxg21b99ehmFo2rRpd6w7fPhwRUdHmz8nT57MpigBAAAA5AV2vUw2J6QmSMePH9fKlSvv2IskSa6urnJ1dc2m6AAAAADkNXYlSSNHjpSHh4ckKSkpSWPHjpWvr68kWT2v5CipCdKhQ4e0atUqu2fZAwAAAAB72ZwkNWjQQAcPHjQv16tXT0ePHk1Txx6xsbE6fPiweTkqKko7d+5UgQIFVKxYMb344ovavn27li1bpuTkZJ09e1aSVKBAAeXPn9+ufQEAAACALUxGds/AYGH16tV66qmn0pR37dpVo0aNUpkyZdLdbtWqVWrUqJFN+4iJiZGvr6+io6PvOlQPAIAcM9eU0xEAQNYJzbGUw4qtucE9P5N08+ZNJSQkyMvLy+5tGzVqdMdZ8nIwfwMAAADwgLJ5drtffvlFs2bNsiobO3asvLy85OfnpyZNmujKlSuOjg8AAAAAspXNSdLEiRMVFxdnXt64caNGjhypESNGaOHChTp58qTGjBmTJUECAAAAQHaxOUnau3ev6tWrZ17+4Ycf9Oyzz+rdd99V27ZtNWHCBP3yyy9ZEiQAAAAAZBebk6Rr165ZTcG9fv16NW7c2LxcrVo1nT592rHRAQAAAEA2szlJKl68uPbv3y/p1tTdu3btsupZunTpkvkdSgAAAABwv7I5SWrXrp0GDBigOXPmqFevXipatKgef/xx8/rIyEhVqlQpS4IEAAAAgOxi8xTgI0eO1KlTp9SvXz8VLVpU3333nfLly2deP2/ePLVo0SJLggQAAACA7GJzkuTu7q5vv/02w/WrVq1ySEAAAAAAkJNsHm4HAAAAAA8Cm3uSnn76aZvqrVy5MtPBAAAAAEBOszlJWr16tYKCgtS8eXO5uLhkZUwAAAAAkGNsTpLGjRunmTNnatGiRerUqZN69Oih4ODgrIwNAAAAALKdzc8kDRkyRPv27dPSpUt17do11a9fX48++qi++OILxcTEZGWMAAAAAJBt7J64oW7dupoxY4bOnDmjPn366JtvvlFgYCCJEgAAAIA8IdOz223fvl1r1qzR/v37FRwczHNKAAAAAPIEu5Kk06dP64MPPlDFihX14osvqkCBAtqyZYs2b94sd3f3rIoRAAAAALKNzRM3PPfcc1q1apWaNGmi8ePHq3nz5nJ2tnlzAAAAALgvmAzDMGyp6OTkpGLFiikgIEAmkynDetu3b3dYcI4QExMjX19fRUdHy8fHJ6fDAQAgfXMzvrcCwH0v1KaUI8vZmhvY3BUUFhbmkMAAAAAAIDcjSQIAAAAAC5me3Q4AAAAA8iKbkqSmTZtq8+bNd6137do1jRs3TlOnTr3nwAAAAAAgJ9g03K5du3Z64YUX5OvrqxYtWqhOnToKDAyUm5ubrly5on379mn9+vX67bff1Lx5c40fPz6r4wYAAACALGHz7HaJiYlatGiRFixYoPXr1ys6OvpWAyaTqlatqpCQEPXs2VNVqlTJ0oDtxex2AID7ArPbAcjL7rPZ7WxOkm4XHR2t69evq2DBgnJxccl0oFmNJAkAcF8gSQKQl91nSVKm3wbr6+srX1/fzG4OAAAAALkSs9sBAAAAgAWSJAAAAACwQJIEAAAAABZIkgAAAADAAkkSAAAAAFiwe3a75ORkTZo0SQsXLtSJEyeUlJRktf7y5csOCw4AAAAAspvdPUnh4eGaOHGiXnrpJUVHR2vgwIFq27atnJycNGrUqCwIEQAAAACyj91J0vfff68ZM2Zo0KBBcnZ2VseOHfXVV19p5MiR2rx5c1bECAAAAADZxu4k6ezZs6pevbokycvLS9HR0ZKk559/Xr/++qtjowMAAACAbGZ3klSiRAmdOXNGklSuXDn973//kyRt3bpVrq6ujo0OAAAAALKZ3UlSmzZttGLFCklS3759NWLECFWoUEFdunRRjx49HB4gAAAAAGQnk2EYxr00sHnzZm3cuFEVKlRQixYtHBWXw8TExMjX11fR0dHy8fHJ6XAAAEjfXFNORwAAWSf0nlIOh7E1N7B7CvDbPf7443r88cfvtRkAAAAAyBXsHm63fPnydMuPHDmihg0b3nNAAAAAAJCT7E6SXnzxRf3www9WZZ988olq1KihihUrOiwwAAAAAMgJdg+3W7hwoflFsg0bNlT37t114sQJ/fDDD2ratGlWxAgAAAAA2cbuJKlZs2b69ddf1bJlSyUmJqpTp0769ddfmRQBAAAAQJ5g93A7SXryySe1cuVKeXl5KSAgINMJ0tq1a9WiRQsFBgbKZDJp6dKlVusNw9DIkSNVrFgxubu765lnntGhQ4cytS8AAAAAsIXdPUlt27Y1fx0YGKgPP/xQGzdulL+/vyRp8eLFNrcVFxenGjVqqEePHlbtpvroo4/06aefavbs2SpTpoxGjBihkJAQ7du3T25ubvaGDgAAAAB3ZXeS5Ovra/66Zs2aqlmzZqZ33qxZMzVr1izddYZhaPLkyXrvvffUqlUrSdK3336rIkWKaOnSperQoUOm9wsAAAAAGbE7SZo5c2ZWxJFGVFSUzp49q2eeecZc5uvrq8cee0ybNm3KMElKTExUYmKieTkmJibLYwUAAACQd2T6ZbJHjx7Vvn37ZDKZVKVKFZUtW9aRcens2bOSpCJFiliVFylSxLwuPREREQoPD3doLAAAAAAeHHeduOHmzZsKDQ1VbGyspFs9M+3atVP58uXVpk0btW7dWhUqVFD79u117dq1LA/4boYPH67o6Gjz5+TJkzkdEgAAAID7yF2TJGdnZ/300086f/68JKl///76559/tG7dOiUkJCghIUFr1qzRP//8o7feesthgRUtWlSSdO7cOavyc+fOmdelx9XVVT4+PlYfAAAAALCVTVOAFypUSDdv3pQk/fzzz5oxY4bq16+vfPnyKV++fHriiSf05ZdfppnC+16UKVNGRYsW1YoVK8xlMTEx2rJli+rWreuw/QAAAACAJZueSSpfvry2bdumihUrKiUlRQUKFEhTx9/fX/Hx8XbtPDY2VocPHzYvR0VFaefOnSpQoIBKlSqlAQMG6P3331eFChXMU4AHBgaqdevWdu0HAAAAAGxlU09Sp06d9M477+jcuXOqX7++Ro0apYSEBPP669evKzw8XI8//rhdO4+MjLSaRnzgwIGqWbOmRo4cKUl6++231bdvX7366qt65JFHFBsbqz/++IN3JAEAAADIMibDMAxbKr722mv66aefFBwcrJUrV6pAgQKqUaOGJGnXrl1ycXHR77//rocffjgr47VbTEyMfH19FR0dzfNJAIDca64ppyMAgKwTalPKkeVszQ1sTpIkad26dfr111914cIFpaSkSLo1zK5y5coKDQ2Vl5fXvUfuYCRJAID7AkkSgLzsPkuS7HpP0pNPPqknn3zynoMDAAAAgNzK7pfJ7t69+47rH3rooUwHAwAAAAA5ze4k6eGHH5bJdGtIwO0j9Uwmk5KTkx0TGQAAAADkALuTpCeeeEI7d+7UsGHDFBoaak6YAAAAACAvsGkKcEtr167VrFmzNGvWLLVv317//fefgoKCzB8AAAAAuJ/ZnSRJUtu2bbVv3z6FhoaqVatWatu2rdVLYQEAAADgfpWpJEmSnJ2dNWDAAB0+fFhlypRRrVq1NGDAAAeGBgAAAADZz673JEm33ouU3nNIcXFxunnzZq6buIH3JAEA7gu8JwlAXpaX35MkSZMnT76XuAAAAAAgV7M7SeratWtWxAEAAAAAuYLdSVJMTMwd1zOkDQAAAMD9zO4kyc/PL91nkgzDkJOTk27evOmQwAAAAAAgJ9idJK1atSrd8sTERDVr1uyeAwIAAACAnGR3ktSwYcN0yxMTE+85GAAAAADIaZl+TxIAAAAA5EV29yT16NEj3fLc9n4kAAAAAMgMu5OkK1eupFuekpJyz8EAAAAAQE6zO0lasmRJuuUJCQny9PS854AAAAAAICc57Jmk9KYFBwAAAID7jd09Sbt37063nNntAAAAAOQFdidJDz/8sEwmkwzDMJelLtObBAAAAOB+Z3eSFBUVlRVxAAAAAECuYHeSFBQUlBVxAAAAAECuYHeSJElHjhzR5MmTtX//fklS1apV1b9/f5UrV86hwQEAAABAdrN7drvly5eratWq+vvvv/XQQw/poYce0pYtW1StWjX9+eefWREjAAAAAGQbk2E5A4MNatasqZCQEH344YdW5cOGDdP//vc/bd++3aEB3quYmBj5+voqOjpaPj4+OR0OAADpm8vkRwDysFC7Uo4sY2tuYHdP0v79+9WzZ8805T169NC+ffvsbQ4AAAAAchW7k6TChQtr586dacp37typgIAAR8QEAAAAADnG7okbevXqpVdffVVHjx5VvXr1JEkbNmzQuHHjNHDgQIcHCAAAAADZye4kacSIEfL29taECRM0fPhwSVJgYKBGjRqlfv36OTxAAAAAAMhOdk/cYOnatWuSJG9vb4cF5GhM3AAAuC8wcQOAvOw+m7ghU+9JSpWbkyMAAAAAyAy7J24AAAAAgLyMJAkAAAAALJAkAQAAAIAFkiQAAAAAsGD3xA13exfSxIkTMx0MAAAAAOQ0u5OkyZMnq27dusqfP78kaf369apdu7bc3d1lMjF9KQAAAID7W6amAF+yZIkCAgIk3ZoGfO7cuSpbtqxDAwMAAACAnGD3M0kuLi5KSkoyL9+4cUM//vijQ4MCAAAAgJxid5JUpkwZzZ8/X5L0448/ysXFRTNmzFDHjh0VHx/v8AABAAAAIDvZnSQNHTpUw4YNk5ubm9q3b69hw4YpMjJS8fHxevTRR7MiRgAAAADINnYnSd27d9fevXs1Z84cbdmyRe+++658fHz0008/qXPnzg4NLjk5WSNGjFCZMmXk7u6ucuXKacyYMTIMw6H7AQAAAIBUmZq4oVKlSqpUqVKa8mHDht1zQJbGjRunadOmafbs2apWrZoiIyPVvXt3+fr6ql+/fg7dFwAAAABImUySUiUkJFhN4iBJPj4+9xSQpY0bN6pVq1Zq3ry5JKl06dKaN2+e/v77b4ftAwAAAAAs2T3cLj4+Xm+++aYCAgLk6ekpf39/q48j1atXTytWrNC///4rSdq1a5fWr1+vZs2aZbhNYmKiYmJirD4AAAAAYCu7k6QhQ4Zo5cqVmjZtmlxdXfXVV18pPDxcgYGB+vbbbx0a3LBhw9ShQwdVrlxZLi4uqlmzpgYMGKBOnTpluE1ERIR8fX3Nn5IlSzo0JgAAAAB5m8mwcxaEUqVK6dtvv1WjRo3k4+Oj7du3q3z58pozZ47mzZun3377zWHBzZ8/X0OGDNH48eNVrVo17dy5UwMGDNDEiRPVtWvXdLdJTExUYmKieTkmJkYlS5ZUdHS0Q4cCAgDgUHNNOR0BAGSd0Nwx8VpMTIx8fX3vmhvY/UzS5cuXVbZsWUm3nj+6fPmyJOmJJ55Q7969Mxlu+oYMGWLuTZKk6tWr6/jx44qIiMgwSXJ1dZWrq6tD4wAAAADw4LB7uF3ZsmUVFRUlSapcubIWLlwoSfrll1/k5+fn0ODi4+Pl5GQdYr58+ZSSkuLQ/QAAAABAKrt7krp3765du3apYcOGGjZsmFq0aKEpU6boxo0bmjhxokODa9GihcaOHatSpUqpWrVq2rFjhyZOnKgePXo4dD8AAAAAkMruZ5Jud/z4cW3btk3ly5fXQw895Ki4JEnXrl3TiBEjtGTJEp0/f16BgYHq2LGjRo4cqfz589vUhq3jDgEAyFE8kwQgL7vPnkm65yTJUkJCgtzc3BzVnEOQJAEA7gskSQDysvssSbL7maRvvvkm3fINGzaoRo0a9jYHAAAAALmK3UnSoEGDNGnSJPNyQkKCBgwYoGeffVYvv/yyQ4MDAAAAgOxm98QNK1asUNOmTXXlyhU1adJE3bt3l6+vr7Zs2aLq1atnRYwAAAAAkG3s7kmqVauW1q5dq1mzZqlhw4bq0qULCRIAAACAPMPuJEm69X6k9evXq1y5cjp8+HCadxkBAAAAwP3K7uF2NWvWlMl0awaeGzduaM6cOdq4caO8vb0lSdu3b3dshAAAAACQjexOklq3bp0FYQAAAABA7mB3khQWFpYVcQAAAABArmD3w0Rbt27Vli1b0pRv2bJFkZGRDgkKAAAAAHKK3UlSnz59dPLkyTTlp06dUp8+fRwSFAAAAADkFLuTpH379qlWrVppymvWrKl9+/Y5JCgAAAAAyCl2J0murq46d+5cmvIzZ87I2dnuR5wAAAAAIFexO0lq0qSJhg8frujoaHPZ1atX9c477+jZZ591aHAAAAAAkN3s7vr5+OOP1aBBAwUFBalmzZqSpJ07d6pIkSKaM2eOwwMEAAAAgOxkd5JUvHhx7d69W99//7127dold3d3de/eXR07dpSLi0tWxAgAAAAA2SZTDxF5enrq1VdfdXQsAAAAAJDj7H4mSZLmzJmjJ554QoGBgTp+/LgkadKkSfrpp58cGhwAAAAAZDe7k6Rp06Zp4MCBatasma5cuaLk5GRJkr+/vyZPnuzo+AAAAAAgW9mdJH322WeaMWOG3n33Xaspv+vUqaM9e/Y4NDgAAAAAyG52J0lRUVHmWe0subq6Ki4uziFBAQAAAEBOsTtJKlOmjHbu3Jmm/I8//lCVKlUcERMAAAAA5Bi7Z7cbOHCg+vTpo4SEBBmGob///lvz5s1TRESEvvrqq6yIEQAAAACyjd1J0iuvvCJ3d3e99957io+PV2hoqAIDA/XJJ5+oQ4cOWREjAAAAAGQbk2EYRmY3jo+PV2xsrAICAhwZk0PFxMTI19dX0dHR8vHxyelwAABI31xTTkcAAFknNNMph0PZmhtk6mWyqTw8POTh4XEvTQAAAABArmJ3klSzZk2ZTBn/tWv79u33FBAAAAAA5CS7k6TWrVubvzYMQxEREXr99ddVoEABR8YFAAAAADninp5JkiRvb2/t2rVLZcuWdVRMDsUzSQCA+wLPJAHIy+6zZ5Lsfk+SJcMwdPPmTTk53VMzAAAAAJBr2D3cbvfu3ZKk69eva8GCBXJxcVGJEiUcHhgAAAAA5AS7k6SHH35YJpNJhmGocOHCmj17tpyd72mSPAAAAADINezObqKioiRJ7u7uufr9SAAAAACQGXYnSUFBQVkRBwAAAADkCnYnSQMHDrzj+okTJ2Y6GAAAAADIaXYnSZMnT5a3t7dq166t22cPv9NLZgEAAADgfmB3kjRjxgyNHDlSzs7OmjBhgqpXr54VcQEAAABAjrD7BUc9e/bUoUOHVLduXdWvX1+9evXSuXPnsiI2AAAAAMh2mXoLrIeHh8LDw3Xw4EElJyerYsWKGj16tK5fv+7o+AAAAAAgW9k93O7nn3+2Wm7durWCgoI0fvx4TZ8+Xf/995/DggMAAACA7GZ3ktS6desM18XFxd1LLAAAAACQ4+xOklJSUrIiDgAAAADIFex+Junbb79VYmJiVsQCAAAAADnO7iSpe/fuio6OzopYAAAAACDH2Z0k3f4C2ax26tQpde7cWQULFpS7u7uqV6+uyMjIbI0BAAAAwIPD7meSJGnhwoXy8fFJd12XLl3uKSBLV65cUf369fXUU0/p999/V+HChXXo0CH5+/s7bB8AAAAAYMlk2Nk15OTkpBIlSihfvnxpGzOZdPToUYcFN2zYMG3YsEHr1q2zeZvExESrZ6ZiYmJUsmRJRUdHZ5jYAQCQ4+aacjoCAMg6odk7Gi0jMTEx8vX1vWtukKmXyUZGRioqKirNx5EJknTrnUx16tRRu3btFBAQoJo1a2rGjBl33CYiIkK+vr7mT8mSJR0aEwAAAIC8LVNJUnY5evSopk2bpgoVKmj58uXq3bu3+vXrp9mzZ2e4zfDhwxUdHW3+nDx5MhsjBgAAAHC/s/uZpKCgoHSH2mWFlJQU1alTRx988IEkqWbNmvrnn3/0xRdfqGvXrulu4+rqKldX12yJDwAAAEDeY3dPUlRUlAoWLJgVsaRRrFgxVa1a1aqsSpUqOnHiRLbsHwAAAMCDJ1Oz28XFxWnNmjU6ceKEkpKSrNb169fPIYFJUv369XXw4EGrsn///VdBQUEO2wcAAAAAWLI7SdqxY4eee+45xcfHKy4uTgUKFNDFixfl4eGhgIAAhyZJb731lurVq6cPPvhA7du3199//63p06dr+vTpDtsHAAAAAFiye7jdW2+9pRYtWujKlStyd3fX5s2bdfz4cdWuXVsff/yxQ4N75JFHtGTJEs2bN0/BwcEaM2aMJk+erE6dOjl0PwAAAACQyu73JPn5+WnLli2qVKmS/Pz8tGnTJlWpUkVbtmxR165ddeDAgayKNVNsnQs9u5h4DQaAPMy+Owqs8J4kAHlZXn9PkouLi5ycbm0WEBBgnkTB19eX6bYBAAAA3PfsfiapZs2a2rp1qypUqKCGDRtq5MiRunjxoubMmaPg4OCsiBEAAAAAso3dPUkffPCBihUrJkkaO3as/P391bt3b124cIEJFQAAAADc9+zuSapTp47564CAAP3xxx8ODQgAAAAAclKm3pMkSefPnze/w6hy5coqXLiww4ICAAAAgJxi93C7a9eu6eWXX1bx4sXVsGFDNWzYUIGBgercubOio6OzIkYAAAAAyDZ2J0mvvPKKtmzZomXLlunq1au6evWqli1bpsjISL322mtZESMAAAAAZBu7h9stW7ZMy5cv1xNPPGEuCwkJ0YwZM9S0aVOHBgcAAAAA2c3unqSCBQvK19c3Tbmvr6/8/f0dEhQAAAAA5BS7k6T33ntPAwcO1NmzZ81lZ8+e1ZAhQzRixAiHBgcAAAAA2c3u4XbTpk3T4cOHVapUKZUqVUqSdOLECbm6uurChQv68ssvzXW3b9/uuEgBAAAAIBvYnSS1bt06C8IAAAAAgNzB7iQpLCwsK+IAAAAAgFwh0y+TvV18fLw+/vhjSZKXl5cGDhzoqKYBAAAAINvYnSRllPzEx8drxowZmjhxojw9Pe85MAAAAADICXYnSZMnT1bdunWVP39+q/KkpCRJUv/+/R0TGQAAAADkgEwNt1uyZIkCAgKsys6ePavixYs7JCgAAAAAyCl2vyfJZDLJZDKlWw4AAAAA9zu7e5IMw1C3bt3k5eUlHx8flSlTRg0aNFD58uWzIj4AAAAAyFZ2J0ldu3aVJCUmJurEiRNavXq1RowYodKlSzs6NgAAAADIdnYnSTNnzkxT9t9//2no0KE6duyYvv32W7m7u6tdu3YOCRAAAAAAspND3pNUokQJTZ06Vfnz59eqVavk5+dHkgQAAADgvuSwl8n6+fml28sEAAAAAPeTTCdJ+/bt04kTJ8zvR5JuzXDXokULhwQGAAAAADnB7iTp6NGjatOmjfbs2SOTySTDMCT93xTgycnJjo0QAAAAALKR3e9J6t+/v8qUKaPz58/Lw8NDe/fu1dq1a1WnTh2tXr06C0IEAAAAgOxjd0/Spk2btHLlShUqVEhOTk5ycnLSE088oYiICPXr1087duzIijgBAAAAIFvY3ZOUnJwsb29vSVKhQoV0+vRpSVJQUJAOHjzo2OgAAAAAIJvZ3ZMUHBysXbt2qUyZMnrsscf00UcfKX/+/Jo+fbrKli2bFTECAAAAQLaxO0l67733FBcXJ0kaPXq0nn/+eT355JMqWLCgFixY4PAAAQAAACA72Z0khYSEmL8uX768Dhw4oMuXL8vf3988wx0AAAAA3K/sfibpdoZh6OrVq1bvSwIAAACA+5XdSdK2bdtUt25dNWvWTEeOHFHt2rVVvnx5FSlSRGvWrMmKGAEAAAAg29idJPXr10/e3t7y8fHRs88+q+rVq2vPnj3q0KGDhg4dmhUxAgAAAEC2sfuZpF27dmnbtm0KCgqSl5eXBg8erGrVquntt9/WQw89lBUxAgAAAEC2sbsnKT4+XgUKFJCbm5vc3d3l6ekpSfL09NT169cdHiAAAAAAZCe7e5IkacaMGfLy8tLNmzc1a9YsFSpUSNeuXXN0bAAAAACQ7UyGYRj2bFC6dOk7TvUdFRV1z0E5UkxMjHx9fRUdHS0fH5+cDkfMkg4gL7PvjgIrc7lBAMjDQnPHDcLW3MDunqRjx47dS1wAAAAAkKvd83uSLJ0/f96RzQEAAABAtrM7SRo5cmS65d9//72qVat2zwEBAAAAQE6ye7jdrFmzFB0drU8++UTSrd6jV199VevXr9fkyZMdHR8AAAAAZCu7e5LWrVun33//XV27dtV3332nKlWqyDAM/fPPP+rcuXNWxGj24YcfymQyacCAAVm6HwAAAAAPLrt7koKCgrR27Vo1adJE3333nb788ku98sorWRGbla1bt+rLL7/khbUAAAAAslSmJm4oWrSo1q5dq8cee0wLFizI8pfIxsbGqlOnTpoxY4b8/f2zdF8AAAAAHmx29yT5+/ub35N048YNxcXFKSAgQC4uLpKky5cvOzZCSX369FHz5s31zDPP6P33379j3cTERCUmJpqXY2JiHB4PAAAAgLzL7iQpuydnmD9/vrZv366tW7faVD8iIkLh4eFZHBUAAACAvMruJKlr165ZEUe6Tp48qf79++vPP/+Um5ubTdsMHz5cAwcONC/HxMSoZMmSWRUiAAAAgDwmU88kHTlyRO+99546duxofoHs77//rr179zo0uG3btun8+fOqVauWnJ2d5ezsrDVr1ujTTz+Vs7OzkpOT02zj6uoqHx8fqw8AAAAA2MruJGnNmjWqXr26tmzZosWLFys2NlaStGvXLoWFhTk0uMaNG2vPnj3auXOn+VOnTh116tRJO3fuVL58+Ry6PwAAAACwe7jdsGHD9P7772vgwIHy9vY2lz/99NOaMmWKQ4Pz9vZWcHCwVZmnp6cKFiyYphwAAAAAHMHunqQ9e/aoTZs2acoDAgJ08eJFhwQFAAAAADnF7p4kPz8/nTlzRmXKlLEq37Fjh4oXL+6wwDKyevXqLN8HAAAAgAeX3T1JHTp00NChQ3X27FmZTCalpKRow4YNGjx4sLp06ZIVMQIAAABAtrE7Sfrggw9UuXJllSxZUrGxsapataoaNGigevXq6b333suKGAEAAAAg25gMwzAys+GJEyf0zz//KDY2VjVr1lSFChUcHZtDxMTEyNfXV9HR0bliOnCTKacjAICsk7k7CiRJc7lBAMjDQnPHDcLW3MDuZ5JSlSpVSqVKlcrs5gAAAACQK9mdJA0cOPCO6ydOnJjpYAAAAAAgp9mdJO3YscP89fr161W7dm25u7tLkkyMJQMAAABwn7M7SVq1apX5a29vb82dO1dly5Z1aFAAAAAAkFPsnt0OAAAAAPIykiQAAAAAsGD3cLuff/7Z/HVKSopWrFihf/75x1zWsmVLx0QGAAAAADnA7vckOTll3PlkMpmUnJx8z0E5Eu9JAoDsw3uS7gHvSQKQl+X19ySlpKTcU2AAAAAAkJvxTBIAAAAAWLC7JykmJibd8vPnz6tSpUry9fVVkSJFtH///nsODgAAAACym91Jkp+fX7ovjTUMQyaTSZcvX3ZIYAAAAACQE+xOkiTphx9+UIECBazKLl26pHbt2jkkKAAAAADIKZlKkurXr6+AgACrsnPnzjkkIAAAAADISZlKkvbt26dLly7Jx8dHgYGB6Q6/AwAAAID7UaaSpMaNG5u/zp8/v+rVq6e2bds6LCgAAAAAyCl2J0lRUVGSpMTERF26dElHjx7VmjVrNHToUIcHBwAAAADZzWQYjnk/+tq1a9WoUSOVLl1ahQsX1pYtWxzR7D2z9a262YWRiQDyMsfcUR5Qc7lBAMjDQnPHDcLW3CBTw+3S88QTT5h7mfLly+eoZgEAAAAgW2UqSbp586ZWr16tI0eOKDQ0VN7e3jp79qwKFiwoLy8vR8cIAAAAANnG7iTp+PHjatq0qU6cOKHExEQ9++yz8vb21rhx45SYmKgvvvgiK+IEAAAAgGzhZO8G/fv3V506dXTlyhW5u7uby9u0aaMVK1Y4NDgAAAAAyG529yStW7dOGzduVP78+a3KS5curVOnTjksMAAAAADICXb3JKWkpCg5OTlN+X///Sdvb2+HBAUAAAAAOcXuJKlJkyaaPHmyedlkMik2NlZhYWF67rnnHBkbAAAAAGQ7u4fbTZgwQSEhIapataoSEhIUGhqqQ4cOqVChQpo3b15WxAgAAAAA2cbuJKlEiRLatWuX5s+fr927dys2NlY9e/ZUp06drCZyAAAAAID7Uabek+Ts7KzOnTs7OhYAAAAAyHGZSpIOHjyozz77TPv375ckValSRW+++aYqV67s0OAAAAAAILvZPXHDjz/+qODgYG3btk01atRQjRo1tH37dlWvXl0//vhjVsQIAAAAANnGZBiGYc8G5cqVU6dOnTR69Gir8rCwMH333Xc6cuSIQwO8VzExMfL19VV0dLR8fHxyOhyZTDkdAQBkHfvuKLAylxsEgDwsNHfcIGzNDezuSTpz5oy6dOmSprxz5846c+aMvc0BAAAAQK5id5LUqFEjrVu3Lk35+vXr9eSTTzokKAAAAADIKXZP3NCyZUsNHTpU27Zt0+OPPy5J2rx5sxYtWqTw8HD9/PPPVnUBAAAA4H5i9zNJTk62dT6ZTCYlJydnKihH4pkkAMg+PJN0D3gmCUBedp89k2R3T1JKSso9BQYAAAAAuZndzyQBAAAAQF5mc5K0cuVKVa1aVTExMWnWRUdHq1q1alq7dq1DgwMAAACA7GZzkjR58mT16tUr3bF7vr6+eu211zRp0iSHBgcAAAAA2c3mJGnXrl1q2rRphuubNGmibdu2OSQoAAAAAMgpNidJ586dk4uLS4brnZ2ddeHCBYcEZSkiIkKPPPKIvL29FRAQoNatW+vgwYMO3w8AAAAASHYkScWLF9c///yT4frdu3erWLFiDgnK0po1a9SnTx9t3rxZf/75p27cuKEmTZooLi7O4fsCAAAAAJunAH/uuec0YsQINW3aVG5ublbrrl+/rrCwMD3//PMOD/CPP/6wWp41a5YCAgK0bds2NWjQwOH7AwAAAPBgszlJeu+997R48WJVrFhRb775pipVqiRJOnDggKZOnark5GS9++67WRZoqujoaElSgQIF0l2fmJioxMRE83J6s/EBAAAAQEZsTpKKFCmijRs3qnfv3ho+fLiM//9adZPJpJCQEE2dOlVFihTJskClWy+yHTBggOrXr6/g4OB060RERCg8PDxL4wAAAACQd5mM1GzHDleuXNHhw4dlGIYqVKggf3//rIgtjd69e+v333/X+vXrVaJEiXTrpNeTVLJkSUVHR6c7fXl2M5lyOgIAyDr231FgNpcbBIA8LDR33CBiYmLk6+t719zA5p4kS/7+/nrkkUcyHVxmvPnmm1q2bJnWrl2bYYIkSa6urnJ1dc3GyAAAAADkJZlKkrKTYRjq27evlixZotWrV6tMmTI5HRIAAACAPCzXJ0l9+vTR3Llz9dNPP8nb21tnz56VJPn6+srd3T2HowMAAACQ19j8nqScMm3aNEVHR6tRo0YqVqyY+bNgwYKcDg0AAABAHpTre5IyMa8EAAAAAGRaru9JAgAAAIDsRJIEAAAAABZIkgAAAADAAkkSAAAAAFggSQIAAAAACyRJAAAAAGCBJAkAAAAALJAkAQAAAIAFkiQAAAAAsECSBAAAAAAWSJIAAAAAwAJJEgAAAABYIEkCAAAAAAskSQAAAABggSQJAAAAACyQJAEAAACABZIkAAAAALBAkgQAAAAAFkiSAAAAAMACSRIAAAAAWCBJAgAAAAALJEkAAAAAYIEkCQAAAAAskCQBAAAAgAWSJAAAAACwQJIEAAAAABZIkgAAAADAAkkSAAAAAFggSQIAAAAACyRJAAAAAGCBJAkAAAAALJAkAQAAAIAFkiQAAAAAsECSBAAAAAAWSJIAAAAAwAJJEgAAAABYIEkCAAAAAAskSQAAAABggSQJAAAAACyQJAEAAACABZIkAAAAALBAkgQAAAAAFkiSAAAAAMDCfZEkTZ06VaVLl5abm5see+wx/f333zkdEgAAAIA8KtcnSQsWLNDAgQMVFham7du3q0aNGgoJCdH58+dzOjQAAAAAeVCuT5ImTpyoXr16qXv37qpataq++OILeXh46Jtvvsnp0AAAAADkQc45HcCdJCUladu2bRo+fLi5zMnJSc8884w2bdqU7jaJiYlKTEw0L0dHR0uSYmJisjZYAID4UXsP4nM6AADIQrnkBpGaExiGccd6uTpJunjxopKTk1WkSBGr8iJFiujAgQPpbhMREaHw8PA05SVLlsySGAEA/8fXN6cjAADkSr1y1w3i2rVr8r3DTStXJ0mZMXz4cA0cONC8nJKSosuXL6tgwYIymUw5GBmQ/WJiYlSyZEmdPHlSPj4+OR0OACAX4N6AB5lhGLp27ZoCAwPvWC9XJ0mFChVSvnz5dO7cOavyc+fOqWjRoulu4+rqKldXV6syPz+/rAoRuC/4+PhwIwQAWOHegAfVnXqQUuXqiRvy58+v2rVra8WKFeaylJQUrVixQnXr1s3ByAAAAADkVbm6J0mSBg4cqK5du6pOnTp69NFHNXnyZMXFxal79+45HRoAAACAPCjXJ0kvvfSSLly4oJEjR+rs2bN6+OGH9ccff6SZzAFAWq6urgoLC0szBBUA8ODi3gDcncm42/x3AAAAAPAAydXPJAEAAABAdiNJAgAAAAALJEkAAAAAYIEkCQAAAAAskCQhTzl79qz69u2rsmXLytXVVSVLllSLFi2s3rUFAHiwdevWTa1bt05Tvnr1aplMJl29ejXbYwKQu+T6KcABWx07dkz169eXn5+fxo8fr+rVq+vGjRtavny5+vTpowMHDuR0iAAAALgP0JOEPOONN96QyWTS33//rRdeeEEVK1ZUtWrVNHDgQG3evFmSVLp0aZlMpnQ/s2bNkiRNnDhR1atXl6enp0qWLKk33nhDsbGx5v3MmjVLfn5+Wrp0qSpUqCA3NzeFhITo5MmT5jqjRo3Sww8/nG6cS5culclksir76aefVKtWLbm5uals2bIKDw/XzZs3MzzWbt26pXsMfn5+5jpHjhxRq1atVKRIEXl5eemRRx7RX3/9ZdVO6dKlNWbMGHXs2FGenp4qXry4pk6dalXHlvNhMpnUsmVLq+0++eQTmUwmdevWzVyWmJiowYMHq3jx4vL09NRjjz2m1atXS/q/v+Bm9Mmqcw8Ad/Ljjz+qWrVqcnV1VenSpTVhwgSr9Zb3FU9PT9WrV0+RkZHm9Y0aNdKAAQPSbXvAgAFq1KiReTklJUUREREqU6aM3N3dVaNGDf3www93jC+j+5plT9kff/yhJ554Qn5+fipYsKCef/55HTlyxLz+2LFjMplMmj9/vurVqyc3NzcFBwdrzZo15jrJycnq2bOnObZKlSrpk08+sYol9d40ceJEq/I2bdpY3Wcl6eTJk2rfvr38/PxUoEABtWrVSseOHZN06+d4RveC1POV2hsYHh6uwoULy8fHR6+//rqSkpIyde6B25EkIU+4fPmy/vjjD/Xp00eenp5p1qcmD1u3btWZM2d05swZlShRQpMnTzYvv/TSS5IkJycnffrpp9q7d69mz56tlStX6u2337ZqLz4+XmPHjtW3336rDRs26OrVq+rQoUOmYl+3bp26dOmi/v37a9++ffryyy81a9YsjR079o7bNW3a1Bz7mTNnNHnyZKv1sbGxeu6557RixQrt2LFDTZs2VYsWLXTixAmreuPHj1eNGjW0Y8cODRs2TP3799eff/5pXm/L+fDw8NCmTZt06tQpc9n06dNVvHhxq3pvvvmmNm3apPnz52v37t1q166dmjZtqkOHDqlevXrmY/nxxx8lyer4Ujny3APAnWzbtk3t27dXhw4dtGfPHo0aNUojRoyw+mVfkkaPHq0zZ84oMjJSnp6e6tOnT6b2FxERoW+//VZffPGF9u7dq7feekudO3e2SlbSk7r/1E/79u2t1sfFxWngwIGKjIzUihUr5OTkpDZt2iglJcWq3pAhQzRo0CDt2LFDdevWVYsWLXTp0iVJtxK4EiVKaNGiRdq3b59Gjhypd955RwsXLrRqo3jx4poxY4Z5+fTp09qwYYM8PDzMZTdu3FBISIi8vb21bt06bdiwQV5eXmratKmSkpI0ePBg87EMGjRIdevWNS8vXrzY3M6KFSu0f/9+rV69WvPmzdPixYsVHh5u30kHMmIAecCWLVsMScbixYtt3iYoKMiYOXPmXestWrTIKFiwoHl55syZhiRj8+bN5rL9+/cbkowtW7YYhmEYYWFhRo0aNdJtb8mSJYblt17jxo2NDz74wKrOnDlzjGLFimUYU9euXY1WrVpZlc2cOdPw9fW947FUq1bN+Oyzz8zLQUFBRtOmTa3qvPTSS0azZs0ybCO98+Hr62v07dvXGD16tGEYhrFu3TqjevXqRqtWrYyuXbsahmEYx48fN/Lly2ecOnXKqr3GjRsbw4cPtypbtWqVkd6PJ0efewAPpq5duxr58uUzPD09rT5ubm6GJOPKlSuGYRhGaGio8eyzz1ptO2TIEKNq1arm5aCgIGPSpEmGYRjG9evXjXbt2llt07BhQ6N///7pxtG/f3+jYcOGhmEYRkJCguHh4WFs3LjRqk7Pnj2Njh07Zngslvu3PL7b7xGWLly4YEgy9uzZYxiGYURFRRmSjA8//NBc58aNG0aJEiWMcePGZdhOnz59jBdeeCHNfh966CFj7dq1hmEYxpgxY4y+ffsavr6+5nvunDlzjEqVKhkpKSnmbRMTEw13d3dj+fLlVvsICwszn6Pbj7FAgQJGXFycuWzatGmGl5eXkZycbBiG7eceSA89ScgTDMNwWFt//fWXGjdurOLFi8vb21svv/yyLl26pPj4eHMdZ2dnPfLII+blypUry8/PT/v37zeX7dmzR15eXvL19VWVKlX04Ycfpru/Xbt2afTo0fLy8jJ/evXqpTNnzljt016xsbEaPHiwqlSpIj8/P3l5eWn//v1pepLq1q2bZtnyOGw5H5L06quv6uuvv1ZKSoqmT5+uXr16Wa3fs2ePkpOTVbFiRatjXbNmjdWwj7tx5LkH8OB66qmntHPnTqvPV199ZVVn//79ql+/vlVZ/fr1dejQISUnJ5vLhg4dKi8vL3l6eurvv/9OM2z5888/l5eXlwoWLKjHHntMv/zyS5p4Dh8+rPj4eD377LNWPyO//fZbu35GpufQoUPq2LGjypYtKx8fH5UuXVqS7ng/cHZ2Vp06dax+tk6dOlW1a9dW4cKF5eXlpenTp6dpQ5J69eql6dOnKyUlRV9//XWa+8GuXbt0+PBheXt7m4+zQIECSkhIsOtYa9SoYdVDVbduXcXGxloNwbbl3APpYeIG5AkVKlSQyWS658kZjh07pueff169e/fW2LFjVaBAAa1fv149e/ZUUlKS1Q/ju6lUqZJ+/vlnJScna/PmzerVq5fKly8vZ2frb7vY2FiFh4erbdu2adpwc3PL9LEMHjxYf/75pz7++GOVL19e7u7uevHFF63Ga9+NPecjODhYgYGBmj9/vpYtW6ZPP/3UalbB2NhY5cuXT9u2bVO+fPms9uPl5ZXp40yPrecewIPL09NT5cuXtyr777//MtXWkCFD1K1bN8XFxenjjz9W+/btFRkZaf5Z16lTJ7377rtKTEzUzJkz9eKLL+ro0aNWbaQ+6/nrr7+mGars6uqaqbhStWjRQkFBQZoxY4YCAwOVkpKi4OBgu+4H8+fP1+DBgzVhwgTVrVtX3t7eGj9+vLZs2ZKmbufOnRUWFqb58+eraNGiql69utX62NhY1a5dW99//32abQsXLmz/Ad6BLeceSA+/MSBPKFCggEJCQjR16lT169cvzXNJV69etZrUICPbtm1TSkqKJkyYICenWx2tt4+3lqSbN28qMjJSjz76qCTp4MGDunr1qqpUqWKukz9/fvMNuFKlSpoyZYp27typOnXqWLVVq1YtHTx4MM3N+l5t2LBB3bp1U5s2bSTduimlPhRrKXVSC8vl1OOw9Xykeu211/T666+rdevWac53zZo1lZycrPPnz+vJJ5/M9HE58twDwJ1UqVJFGzZssCrbsGGDKlasaPXHnkKFCpl/5gwdOlTVq1dXVFSUuczX19f8dXh4uCZMmGDVQyNJVatWlaurq06cOKGGDRs67BguXbqkgwcPasaMGeafvevXr0+37ubNm9WgQQNJt37Wbtu2TW+++ab5uOvVq6c33njDXD+jXh8/Pz+1bNlSr7/+eprnZaVb970FCxYoICBAPj4+mT62Xbt26fr163J3dzfH7+XlpZIlS5rr2HLugfQw3A55xtSpU5WcnKxHH31UP/74ow4dOqT9+/fr008/TTOkLCPly5fXjRs39Nlnn+no0aOaM2eOvvjiizT1XFxc1LdvX23ZskXbtm1Tt27d9Pjjj5t/cZduDQFMSEhQXFycVq5cqX379ik4ODhNWyNHjtS3336r8PBw7d27V/v379f8+fP13nvvZf5k6Fbv2uLFi7Vz507t2rVLoaGhaR7SlW7d+D766CP9+++/mjp1qhYtWqT+/fvbdT5StW/fXu+++66GDx+eZl3FihXVqVMndenSRYsXL1ZUVJT+/vtvRURE6Ndff7X5uBx57gHgTgYNGqQVK1ZozJgx+vfffzV79mxNmTJFgwcPtqp37do1nT17VkePHtWUKVPk7e1t1RuUnJyshIQERUdH68svv5SLi4sqVapk1Ya3t7cGDx6st956S7Nnz9aRI0e0fft2ffbZZ5o9e3amj8Hf318FCxbU9OnTdfjwYa1cuVIDBw5Mt+7UqVO1ZMkSHThwQH369NGVK1fUo0cPSbfuKZGRkVq+fLn+/fdfjRgxQlu3bs1wv8OGDdM777xjnhTJUqdOnVSoUCG1atVK69atU1RUlFavXq1+/frZ1ZuXlJSknj17at++ffrtt98UFhamN9980/xHPcm2cw+khyQJeUbZsmW1fft2PfXUUxo0aJCCg4P17LPPasWKFZo2bZpNbdSoUUMTJ07UuHHjFBwcrO+//14RERFp6nl4eGjo0KEKDQ1V/fr15eXlpQULFljV2b17t9zd3eXj46Nu3bpp0KBB6c7CFhISomXLlul///ufHnnkET3++OOaNGmSgoKCMnci/r+JEyfK399f9erVU4sWLRQSEqJatWqlqTdo0CBFRkaqZs2aev/99zVx4kSFhITYdT5Subu7a+jQoVa9OpZmzpypLl26aNCgQapUqZJat26trVu3qlSpUjYflyPPPQDcSa1atbRw4ULNnz9fwcHBGjlypEaPHm31agPp1h+7ihUrpuDgYG3fvl1Lly41925I0pQpU+Tu7q6AgAB98803+v777616O1KNGTNGI0aMUEREhKpUqaKmTZvq119/VZkyZTJ9DE5OTpo/f762bdum4OBgvfXWWxo/fny6dT/88EN9+OGHqlGjhtavX6+ff/5ZhQoVknRrpEDbtm310ksv6bHHHtOlS5esepVuV6lSJQ0bNizdGWc9PDy0du1alSpVSm3btlWVKlXUs2dPJSQk2NWz1LhxY1WoUEENGjTQSy+9pJYtW2rUqFFWdWw998DtTIYjn3gHHgCzZs3SgAED8sQb2UuXLq0BAwZk+B6J3CYvnXsAyC2OHTumMmXKaMeOHRm+Zy636datm65evaqlS5fmdCjIo+hJAgAAAAALJEkAAAAAYIHhdgAAAABggZ4kAAAAALBAkgQAAAAAFkiSAAAAAMACSRIAAAAAWCBJAgA41I0bN3I6BAAA7glJEgDgnnzzzTd6+umnVapUKXl4eOjll1/O6ZAAALgnzjkdAADAMbp166bZs2dnuP7KlSvy8/Nz6D5fe+01/fHHHxo7dqzq1KkjZ2dnBQQEOHQfAABkN5IkAMhDmjZtqpkzZ1qVbdy4US+88ILD97Vu3TotWbJEu3btUrFixRzePgAAOYXhdgCQh7i6uqpo0aJWnwIFCqSp9+OPP6patWpydXVV6dKlNWHChDR1Zs2aJZPJZPV5+OGHzeuXLVum6tWr65VXXpGfn58KFCigbt26KTo62lwnJSVFo0ePVokSJeTq6qqHH35Yf/zxh3n9sWPHZDKZNH/+fNWrV09ubm4KDg7WmjVr7nicpUuXThObyWRS69atzXX++OMPPfHEE/Lz81PBggX1/PPP68iRI3btOzk5WT179lSZMmXk7u6uSpUq6ZNPPrGKpVu3bjKZTJo4caJVeZs2bWQymTRr1ixz2cmTJ9W+fXvz+WrVqpWOHTsmSRo1alS6x2QymdSoUSPzvlq3bq3w8HAVLlxYPj4+ev3115WUlGTeR2Jiovr166eAgAC5ubnpiSee0NatW83rV69ebW7XyclJAQEB6tmzpxISEu54zgHgQUKSBAAPmG3btql9+/bq0KGD9uzZo1GjRmnEiBFWv8yn8vHx0ZkzZ3TmzBkNGjTIat2FCxe0cuVKubm5ad26dVq6dKk2b96sHj16mOt88sknmjBhgj7++GPt3r1bISEhatmypQ4dOmTV1pAhQzRo0CDt2LFDdevWVYsWLXTp0qU7Hsfo0aPNsZ05c0bt27e3Wh8XF6eBAwcqMjJSK1askJOTk9q0aaOUlBSb952SkqISJUpo0aJF2rdvn0aOHKl33nlHCxcutGqjePHimjFjhnn59OnT2rBhgzw8PMxlN27cUEhIiLy9vbVu3Tpt2LBBXl5eatq0qZKSkjR48GCrc123bl3z8uLFi83trFixQvv379fq1as1b948LV68WOHh4eb1b7/9tn788UfNnj1b27dvV/ny5RUSEqLLly9bxXzw4EGdOnVK3333nRYsWJCmBxIAHmgGACBP6Nq1q9GqVas05atWrTIkGVeuXDEMwzBCQ0ONZ5991qrOkCFDjKpVq1qVffHFF0ahQoXMy2FhYUaNGjWs9ufv72/Exsaay9atW2dIMg4dOmQYhmEEBgYaY8eOtWr3kUceMd544w3DMAwjKirKkGR8+OGH5vU3btwwSpQoYYwbNy7DYw0KCjImTZpk0/GnunDhgiHJ2LNnzz3tu0+fPsYLL7yQZr8PPfSQsXbtWsMwDGPMmDFG3759DV9fX2PmzJmGYRjGnDlzjEqVKhkpKSnmbRMTEw13d3dj+fLlVvsICwszGjZsmGbfXbt2NQoUKGDExcWZy6ZNm2Z4eXkZycnJRmxsrOHi4mJ8//335vVJSUlGYGCg8dFHHxmGkfZ6OHTokOHv72+1DQA86OhJAoAHzP79+1W/fn2rsvr16+vQoUNKTk42l126dEk+Pj53bKtGjRry9PQ0Lz/++OPKly+f9u3bp5iYGJ0+fTrdfe3fv9+qrG7duuavnZ2dVadOnTR17HXo0CF17NhRZcuWlY+Pj0qXLi1JOnHihF37njp1qmrXrq3ChQvLy8tL06dPT9OGJPXq1UvTp09XSkqKvv76a/Xq1ctq/a5du3T48GF5e3vLy8tLXl5eKlCggBISEqyGAd5NjRo1rHqo6tatq9jYWJ08eVJHjhzRjRs3rM65i4uLHn300TTns0SJEvL09FSFChX03HPPqWPHjjbHAAB5HRM3AADSdfToUZUpUybD9f7+/jp+/Hi660wmU1aFZbMWLVooKChIM2bMUGBgoFJSUhQcHGz1/M7dzJ8/X4MHD9aECRNUt25deXt7a/z48dqyZUuaup07d1ZYWJjmz5+vokWLqnr16lbrY2NjVbt2bX3//fdpti1cuLD9B3iP1q1bJ29vb0VFRenVV1/VxIkT0wypBIAHFT1JAPCAqVKlijZs2GBVtmHDBlWsWFH58uUzl61du1ZPPvlkhu1UrlxZu3btUlxcnLls8+bNSk5OVpUqVeTj46PAwMB091W1alWrss2bN5u/vnnzprZt26YqVapk6vikW71gBw8e1HvvvafGjRurSpUqunLlSrp177TvDRs2qF69enrjjTdUs2ZNlS9fPsNeHz8/P7Vs2VKvv/56ml4kSapVq5YOHTqkgIAAlS9f3urj6+tr87Ht2rVL169ft4rfy8tLJUuWVLly5ZQ/f36rc37jxg1t3bo1zTkvU6aMypcvr2effVYvvPCClixZYnMMAJDXkSQBwANm0KBBWrFihcaMGaN///1Xs2fP1pQpUzR48GBJ0vXr1/XZZ5/pyJEjatasmc6ePauzZ88qNjZWN2/eNE8AEBoaKhcXF3Xp0kV79uzRunXr1KtXL7Vt21bly5eXdGtShHHjxmnBggU6ePCghg0bpp07d6p///5WMU2dOlVLlizRgQMH1KdPH125csVqAgh7+fv7q2DBgpo+fboOHz6slStXauDAgenWvdO+K1SooMjISC1fvlz//vuvRowYYTVT3O2GDRumd955Ry+99FKadZ06dVKhQoXUqlUrrVu3TlFRUVq9erX69eun//77z+ZjS0pKUs+ePbVv3z799ttvCgsL05tvviknJyd5enqqd+/eGjJkiP744w/t27dPvXr1Unx8vHr27GnVzvnz53X27Flt2bJFv/zyiypXrmxzDACQ1zHcDgAeMLVq1dLChQs1cuRIjRkzRsWKFdPo0aPVrVs3SdKCBQvUr18/SdJjjz2WZvu2bdtq9erV8vb21u+//66BAwfqkUcekYeHh1q1aqXJkyeb6/br10/R0dEaNGiQzp8/r6pVq+rnn39WhQoVrNr88MMP9eGHH2rnzp0qX768fv75ZxUqVCjTx+jk5KT58+erX79+Cg4OVqVKlfTpp5+ap9K2dd+vvfaaduzYoZdeekkmk0kdO3bUG2+8od9//z3d/VaqVEnDhg1Ld52Hh4fWrl2roUOHqm3btrp27ZqKFy+uxo0b3/XZL0uNGzdWhQoV1KBBAyUmJqpjx44aNWqU1fGkpKTo5Zdf1rVr11SnTh0tX75c/v7+aWKVpEKFCqlJkyb66KOPbI4BAPI6k2EYRk4HAQDIPWbNmqXVq1enOyX4zp07NWDAAK1evdoh+zp27JjKlCmjHTt2WL2DKTvk5L4zq1u3brp69aqWLl2a06EAQJ7GcDsAgBV3d/cMn5FxcXFJ9+W0AADkJQy3AwBYeemll9J9pkaSqlWrZvViUwAA8iKG2wEAAACABYbbAQAAAIAFkiQAAAAAsECSBAAAAAAWSJIAAAAAwAJJEgAAAABYIEkCAAAAAAskSQAAAABggSQJAAAAACz8PyI0BN5yWyWVAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn import metrics\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.model_selection import train_test_split, GridSearchCV\n", + "from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.impute import SimpleImputer\n", + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "# Определение столбцов для обработки\n", + "num_columns = features.select_dtypes(include=['number']).columns\n", + "cat_columns = features.select_dtypes(include=['object']).columns\n", + "\n", + "# Препроцессинг числовых столбцов\n", + "num_imputer = SimpleImputer(strategy=\"median\") # Используем медиану для заполнения пропущенных значений в числовых столбцах\n", + "num_scaler = StandardScaler()\n", + "preprocessing_num = Pipeline(\n", + " [\n", + " (\"imputer\", num_imputer),\n", + " (\"scaler\", num_scaler),\n", + " ]\n", + ")\n", + "\n", + "# Препроцессинг категориальных столбцов\n", + "cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\") # Используем 'unknown' для заполнения пропущенных значений в категориальных столбцах\n", + "cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n", + "preprocessing_cat = Pipeline(\n", + " [\n", + " (\"imputer\", cat_imputer),\n", + " (\"encoder\", cat_encoder),\n", + " ]\n", + ")\n", + "\n", + "# Объединение препроцессинга\n", + "features_preprocessing = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"preprocessing_num\", preprocessing_num, num_columns),\n", + " (\"preprocessing_cat\", preprocessing_cat, cat_columns),\n", + " ],\n", + " remainder=\"passthrough\"\n", + ")\n", + "\n", + "# Создание финального пайплайна\n", + "pipeline_end = Pipeline(\n", + " [\n", + " (\"features_preprocessing\", features_preprocessing),\n", + " ]\n", + ")\n", + "\n", + "# Разделение данных на обучающую и тестовую выборки\n", + "X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)\n", + "\n", + "# Применение пайплайна к данным\n", + "X_train_processed = pipeline_end.fit_transform(X_train)\n", + "X_test_processed = pipeline_end.transform(X_test)\n", + "\n", + "# 1. Настройка параметров для старых значений\n", + "old_param_grid = {\n", + " 'n_estimators': [50, 100, 200], # Количество деревьев\n", + " 'max_depth': [None, 10, 20, 30], # Максимальная глубина дерева\n", + " 'min_samples_split': [2, 5, 10] # Минимальное количество образцов для разбиения узла\n", + "}\n", + "\n", + "# Подбор гиперпараметров с помощью Grid Search для старых параметров\n", + "old_grid_search = GridSearchCV(estimator=RandomForestRegressor(), \n", + " param_grid=old_param_grid,\n", + " scoring='neg_mean_squared_error', cv=3, n_jobs=-1, verbose=2)\n", + "\n", + "# Обучение модели на тренировочных данных\n", + "old_grid_search.fit(X_train_processed, y_train)\n", + "\n", + "# 2. Результаты подбора для старых параметров\n", + "old_best_params = old_grid_search.best_params_\n", + "old_best_mse = -old_grid_search.best_score_ # Меняем знак, так как берем отрицательное значение MSE\n", + "\n", + "# 3. Настройка параметров для новых значений\n", + "new_param_grid = {\n", + " 'n_estimators': [200],\n", + " 'max_depth': [10],\n", + " 'min_samples_split': [10]\n", + "}\n", + "\n", + "# Подбор гиперпараметров с помощью Grid Search для новых параметров\n", + "new_grid_search = GridSearchCV(estimator=RandomForestRegressor(), \n", + " param_grid=new_param_grid,\n", + " scoring='neg_mean_squared_error', cv=2)\n", + "\n", + "# Обучение модели на тренировочных данных\n", + "new_grid_search.fit(X_train_processed, y_train)\n", + "\n", + "# 4. Результаты подбора для новых параметров\n", + "new_best_params = new_grid_search.best_params_\n", + "new_best_mse = -new_grid_search.best_score_ # Меняем знак, так как берем отрицательное значение MSE\n", + "\n", + "# 5. Обучение модели с лучшими параметрами для новых значений\n", + "model_best = RandomForestRegressor(**new_best_params)\n", + "model_best.fit(X_train_processed, y_train)\n", + "\n", + "# Прогнозирование на тестовой выборке\n", + "y_pred = model_best.predict(X_test_processed)\n", + "\n", + "# Оценка производительности модели\n", + "mse = metrics.mean_squared_error(y_test, y_pred)\n", + "rmse = np.sqrt(mse)\n", + "\n", + "# Вывод результатов\n", + "print(\"Старые параметры:\", old_best_params)\n", + "print(\"Лучший результат (MSE) на старых параметрах:\", old_best_mse)\n", + "print(\"\\nНовые параметры:\", new_best_params)\n", + "print(\"Лучший результат (MSE) на новых параметрах:\", new_best_mse)\n", + "print(\"Среднеквадратическая ошибка (MSE) на тестовых данных:\", mse)\n", + "print(\"Корень среднеквадратичной ошибки (RMSE) на тестовых данных:\", rmse)\n", + "\n", + "# Визуализация ошибок\n", + "plt.figure(figsize=(10, 5))\n", + "plt.bar(['Старые параметры', 'Новые параметры'], [old_best_mse, new_best_mse], color=['blue', 'orange'])\n", + "plt.xlabel('Подбор параметров')\n", + "plt.ylabel('Среднеквадратическая ошибка (MSE)')\n", + "plt.title('Сравнение MSE для старых и новых параметров')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Модель, обученная на новых параметрах, показала худший результат (MSE) на кросс-валидации, что указывает на ее меньшую точность по сравнению с моделью, обученной на старых параметрах. Однако, MSE на тестовых данных одинакова для обеих моделей, что говорит о том, что обе модели имеют одинаковую производительность на тестовых данных." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "aimenv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}