{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Вариант: Список людей. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "RangeIndex: 10000 entries, 0 to 9999\n",
      "Data columns (total 10 columns):\n",
      " # Column Non-Null Count Dtype \n",
      "--- ------ -------------- ----- \n",
      " 0 Id 10000 non-null object \n",
      " 1 Name 10000 non-null object \n",
      " 2 Short description 9996 non-null object \n",
      " 3 Gender 9927 non-null object \n",
      " 4 Country 9721 non-null object \n",
      " 5 Occupation 9836 non-null object \n",
      " 6 Birth year 10000 non-null int64 \n",
      " 7 Death year 9999 non-null float64\n",
      " 8 Manner of death 1893 non-null object \n",
      " 9 Age of death 9999 non-null float64\n",
      "dtypes: float64(2), int64(1), object(7)\n",
      "memory usage: 781.4+ KB\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from imblearn.over_sampling import RandomOverSampler\n",
    "from imblearn.under_sampling import RandomUnderSampler\n",
    "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.linear_model import LinearRegression, LogisticRegression\n",
    "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier\n",
    "from sklearn.model_selection import train_test_split, GridSearchCV\n",
    "from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score\n",
    "import numpy as np\n",
    "import featuretools as ft\n",
    "from sklearn.metrics import accuracy_score, classification_report\n",
    "\n",
    "\n",
    "def apply_oversampling(X, y):\n",
    "    \"\"\"Balance the classes by randomly duplicating minority-class samples.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    X : array-like\n",
    "        Feature matrix handed to RandomOverSampler.fit_resample().\n",
    "    y : array-like\n",
    "        Class labels aligned with X.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    X_resampled, y_resampled :\n",
    "        The resampled features and labels (sampler seeded with random_state=42).\n",
    "    \"\"\"\n",
    "    oversampler = RandomOverSampler(random_state=42)\n",
    "    return oversampler.fit_resample(X, y)\n",
    "\n",
    "\n",
    "def apply_undersampling(X, y):\n",
    "    \"\"\"Balance the classes by randomly discarding majority-class samples.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    X : array-like\n",
    "        Feature matrix handed to RandomUnderSampler.fit_resample().\n",
    "    y : array-like\n",
    "        Class labels aligned with X.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    X_resampled, y_resampled :\n",
    "        The resampled features and labels (sampler seeded with random_state=42).\n",
    "    \"\"\"\n",
    "    undersampler = RandomUnderSampler(random_state=42)\n",
    "    return undersampler.fit_resample(X, y)\n",
    "\n",
    "\n",
    "def split_stratified_into_train_val_test(\n",
    "    df_input,\n",
    "    stratify_colname=\"y\",\n",
    "    frac_train=0.6,\n",
    "    frac_val=0.15,\n",
    "    frac_test=0.25,\n",
    "    random_state=None,\n",
    "):\n",
    "    \"\"\"\n",
    "    Splits a Pandas dataframe into three subsets (train, val, and test)\n",
    "    following fractional ratios provided by the user, where each subset is\n",
    "    stratified by the values in a specific column (that is, each subset has\n",
    "    the same relative frequency of the values in the column). It performs this\n",
    "    splitting by running train_test_split() twice.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df_input : Pandas dataframe\n",
    "        Input dataframe to be split.\n",
    "    stratify_colname : str\n",
    "        The name of the column that will be used for stratification. Usually\n",
    "        this column would be for the label.\n",
    "    frac_train : float\n",
    "    frac_val : float\n",
    "    frac_test : float\n",
    "        The ratios with which the dataframe will be split into train, val, and\n",
    "        test data. The values should be expressed as positive float fractions\n",
    "        and should sum to 1.0 (checked with a floating-point tolerance).\n",
    "    random_state : int, None, or RandomStateInstance\n",
    "        Value to be passed to train_test_split().\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    df_train, df_val, df_test :\n",
    "        Dataframes containing the three splits.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If a fraction is not positive, if the fractions do not add up to 1.0,\n",
    "        or if stratify_colname is not a column of df_input.\n",
    "    \"\"\"\n",
    "\n",
    "    fractions = (frac_train, frac_val, frac_test)\n",
    "\n",
    "    if any(frac <= 0 for frac in fractions):\n",
    "        raise ValueError(\"fractions %f, %f, %f must all be positive\" % fractions)\n",
    "\n",
    "    # Compare with a tolerance: an exact `!= 1.0` test rejects valid inputs such\n",
    "    # as 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999 in floating point.\n",
    "    if not np.isclose(sum(fractions), 1.0):\n",
    "        raise ValueError(\"fractions %f, %f, %f do not add up to 1.0\" % fractions)\n",
    "\n",
    "    if stratify_colname not in df_input.columns:\n",
    "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
    "\n",
    "    X = df_input  # Contains all columns.\n",
    "    y = df_input[[stratify_colname]]  # Dataframe of just the column on which to stratify.\n",
    "\n",
    "    # Split original dataframe into train and temp dataframes.\n",
    "    df_train, df_temp, y_train, y_temp = train_test_split(\n",
    "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
    "    )\n",
    "\n",
    "    # Split the temp dataframe into val and test dataframes; the test share is\n",
    "    # re-expressed relative to the size of the temp dataframe.\n",
    "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
    "    df_val, df_test, y_val, y_test = train_test_split(\n",
    "        df_temp,\n",
    "        y_temp,\n",
    "        stratify=y_temp,\n",
    "        test_size=relative_frac_test,\n",
    "        random_state=random_state,\n",
    "    )\n",
    "\n",
    "    # No row may be lost or duplicated by the two-step split.\n",
    "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
    "\n",
    "    return df_train, df_val, df_test\n",
    "\n",
    "\n",
    "df = pd.read_csv(\"../data/age.csv\", nrows=10000)\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Как бизнес-цели выделим следующие 2 варианта:\n",
    " 1) GameDev. Создание игры про конкретного персонажа, живущего в конкретном временном промежутке в конкретной стране. 
\n",
    " 2) Исследование зависимости длительности жизни от страны проживания.\n",
    " \n",
    "Поскольку именно эти бизнес-цели были выбраны в предыдущей лабораторной работе, будем их использовать.\n",
    "Но возникает проблема с 1 целью: её невозможно использовать для задачи классификации. Заменим ее на классификацию людей по возрастным группам, что может быть полезно для рекламных целей."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Выполним подготовку данных"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimum number of rows a country needs in order to stay in the analysis.\n",
    "MIN_ROWS_PER_COUNTRY = 100\n",
    "\n",
    "# Missing categorical values are replaced by the literal string \"NaN\", so the\n",
    "# dropna() below only removes rows that miss one of the remaining fields.\n",
    "categorical_fill = {\"Gender\": \"NaN\", \"Country\": \"NaN\", \"Occupation\": \"NaN\", \"Manner of death\": \"NaN\"}\n",
    "\n",
    "# Build the cleaned frame as a new object: `df` keeps the rows exactly as they\n",
    "# were loaded (and as shown by df.info()), and this cell can be re-run on its own.\n",
    "data = (\n",
    "    df.fillna(categorical_fill)\n",
    "    .dropna()\n",
    "    # A person may list several countries separated by '; ': one row per country.\n",
    "    .assign(Country=lambda frame: frame[\"Country\"].str.split(\"; \"))\n",
    "    .explode(\"Country\")\n",
    "    # explode() repeats the index label of the source row; renumber the rows so\n",
    "    # that every row has a unique label.\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "\n",
    "# Rare countries are determined before the gender filter is applied.\n",
    "country_counts = data[\"Country\"].value_counts()\n",
    "rare_countries = country_counts[country_counts < MIN_ROWS_PER_COUNTRY].index\n",
    "\n",
    "# Keep only frequent countries and the two gender values used in the analysis.\n",
    "# Boolean masks are used instead of dropping rows by index label.\n",
    "keep_rows = ~data[\"Country\"].isin(rare_countries) & data[\"Gender\"].isin(['Male', 'Female'])\n",
    "data = data[keep_rows].reset_index(drop=True)\n",
    "\n",
    "# One-hot encode the categorical columns; drop_first removes one level per\n",
    "# column, which then acts as the reference category.\n",
    "data1 = pd.get_dummies(data, columns=['Gender', 'Country', 'Occupation'], drop_first=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Определить достижимый уровень качества модели для каждой задачи. На основе имеющихся данных уровень качества моделей не будет высоким, поскольку всё-таки длительность жизни лишь примерная и точно ее угадать невозможно."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Выберем ориентиры для наших 2-х задач:\n",
    " 1) Регрессии - средний возраст человека\n",
    " 2) Классификации - наиболее часто встречающаяся возрастная группа"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Построим конвейер."
]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['Id', 'Name', 'Short description', 'Birth year', 'Death year',\n",
      " 'Age of death', 'Gender_Male', 'Country_France',\n",
      " 'Country_German Confederation', 'Country_German Democratic Republic',\n",
      " ...\n",
      " 'Manner of death_euthanasia', 'Manner of death_homicide',\n",
      " 'Manner of death_homicide; natural causes',\n",
      " 'Manner of death_internal bleeding', 'Manner of death_natural causes',\n",
      " 'Manner of death_suicide',\n",
      " 'Manner of death_suicide; homicide; accident',\n",
      " 'Manner of death_suicide; unfortunate accident',\n",
      " 'Manner of death_summary execution', 'Manner of death_unnatural death'],\n",
      " dtype='object', length=400)\n"
     ]
    }
   ],
   "source": [
    "print(data.columns)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters for Linear Regression: {}\n",
      "Best parameters for Random Forest Regressor: {'model__max_depth': None, 'model__n_estimators': 100}\n",
      "Best parameters for Gradient Boosting Regressor: {'model__learning_rate': 0.2, 'model__max_depth': 7, 'model__n_estimators': 300}\n",
      "Linear Regression: MSE = 0.002807184047660083, R2 = 0.9999899555289343\n",
      "Random Forest Regressor: MSE = 11.46917740409879, R2 = 0.9589617856804076\n",
      "Gradient Boosting Regressor: MSE = 8.202651735797296, R2 = 0.9706498410424512\n"
     ]
    }
   ],
   "source": [
    "# Columns that are not valid regression features:\n",
    "#  - 'Id', 'Name', 'Short description', 'Manner of death': identifiers / raw text\n",
    "#  - 'Age of death': the target itself\n",
    "#  - 'Death year': together with 'Birth year' it gives the target away (target\n",
    "#    leakage). With both year columns present even a plain linear model scores\n",
    "#    R2 ~ 1.0, which says nothing about real predictive quality.\n",
    "# NOTE: the stored output of this cell was produced before 'Death year' was\n",
    "# excluded; re-run the cell to refresh the metrics.\n",
    "reg_excluded_columns = ['Id', 'Name', 'Age of death', 'Short description', 'Manner of death', 'Death year']\n",
    "X_reg = data1.drop(columns=reg_excluded_columns)\n",
    "y_reg = data1['Age of death']\n",
    "\n",
    "# Hold out 20% of the rows for the final evaluation.\n",
    "# NOTE(review): 'Country' was exploded to one row per country during data\n",
    "# preparation, so rows of the same person can land in both train and test;\n",
    "# consider a group-aware split keyed on 'Id'.\n",
    "X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
    "\n",
    "# Candidate regression models\n",
    "models_reg = {\n",
    "    'Linear Regression': LinearRegression(),\n",
    "    'Random Forest Regressor': RandomForestRegressor(random_state=42),\n",
    "    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42)\n",
    "}\n",
    "\n",
    "# One pipeline per model: standardise the features, then fit the model.\n",
    "pipelines_reg = {\n",
    "    name: Pipeline([\n",
    "        ('scaler', StandardScaler()),\n",
    "        ('model', model)\n",
    "    ])\n",
    "    for name, model in models_reg.items()\n",
    "}\n",
    "\n",
    "# Hyper-parameter grids; the 'model__' prefix addresses the 'model' pipeline step.\n",
    "param_grids_reg = {\n",
    "    'Linear Regression': {},\n",
    "    'Random Forest Regressor': {\n",
    "        'model__n_estimators': [100, 200, 300],\n",
    "        'model__max_depth': [None, 10, 20, 30]\n",
    "    },\n",
    "    'Gradient Boosting Regressor': {\n",
    "        'model__n_estimators': [100, 200, 300],\n",
    "        'model__learning_rate': [0.01, 0.1, 0.2],\n",
    "        'model__max_depth': [3, 5, 7]\n",
    "    }\n",
    "}\n",
    "\n",
    "# Hyper-parameter search with 5-fold cross-validation on the training part.\n",
    "# GridSearchCV refits the best configuration on the whole training set\n",
    "# (refit=True is the default), so best_estimator_ is ready to use and no\n",
    "# separate fit() pass is needed afterwards.\n",
    "best_models_reg = {}\n",
    "for name, pipeline in pipelines_reg.items():\n",
    "    grid_search = GridSearchCV(\n",
    "        pipeline,\n",
    "        param_grids_reg[name],\n",
    "        cv=5,\n",
    "        scoring='neg_mean_squared_error',\n",
    "        n_jobs=-1,  # evaluate the grid in parallel; does not change the result\n",
    "    )\n",
    "    grid_search.fit(X_train_reg, y_train_reg)\n",
    "    best_models_reg[name] = grid_search.best_estimator_\n",
    "    print(f'Best parameters for {name}: {grid_search.best_params_}')\n",
    "\n",
    "# Evaluate the tuned models on the held-out test set\n",
    "for name, model in best_models_reg.items():\n",
    "    y_pred_reg = model.predict(X_test_reg)\n",
    "    mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
    "    r2 = r2_score(y_test_reg, y_pred_reg)\n",
    "    print(f'{name}: MSE = {mse}, R2 = {r2}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "data2 = data.drop(['Short description', 'Manner of death', 'Gender', 'Country', 'Occupation'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['Birth year', 'Death year'], dtype='object')\n",
      "Best parameters for Logistic Regression: 
{'model__C': 10, 'model__solver': 'lbfgs'}\n", "Best parameters for Random Forest Classifier: {'model__max_depth': 30, 'model__n_estimators': 200}\n", "Best parameters for Gradient Boosting Classifier: {'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__n_estimators': 200}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n",
      " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Logistic Regression: Accuracy = 0.9248554913294798\n",
      "Classification Report:\n",
      " precision recall f1-score support\n",
      "\n",
      " 0-18 0.00 0.00 0.00 5\n",
      " 19-30 0.50 0.08 0.14 60\n",
      " 31-50 0.77 0.77 0.77 242\n",
      " 51-70 0.91 0.96 0.94 650\n",
      " 71+ 0.98 1.00 0.99 946\n",
      "\n",
      " accuracy 0.92 1903\n",
      " macro avg 0.63 0.56 0.57 1903\n",
      "weighted avg 0.91 0.92 0.91 1903\n",
      "\n",
      "Random Forest Classifier: Accuracy = 0.9485023646873357\n",
      "Classification Report:\n",
      " precision recall f1-score support\n",
      "\n",
      " 0-18 0.67 0.40 0.50 5\n",
      " 19-30 0.96 0.77 0.85 60\n",
      " 31-50 0.88 0.89 0.88 242\n",
      " 51-70 0.92 0.95 0.94 650\n",
      " 71+ 0.99 0.97 0.98 946\n",
      "\n",
      " accuracy 0.95 1903\n",
      " macro avg 0.88 0.80 0.83 1903\n",
      "weighted avg 0.95 0.95 0.95 1903\n",
      "\n",
      "Gradient Boosting Classifier: Accuracy = 0.9379926431949553\n",
      "Classification Report:\n",
      " precision recall f1-score support\n",
      "\n",
      " 0-18 1.00 0.40 0.57 5\n",
      " 19-30 0.96 0.77 0.85 60\n",
      " 31-50 0.87 0.87 0.87 242\n",
      " 51-70 0.90 0.95 0.92 650\n",
      " 71+ 0.98 0.96 0.97 946\n",
      "\n",
      " accuracy 0.94 1903\n",
      " macro avg 0.94 0.79 0.84 1903\n",
      "weighted avg 0.94 0.94 0.94 1903\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Age groups (right-inclusive bins). The last bin is open-ended so that ages\n",
    "# above 100 are labelled '71+' instead of becoming NaN, and include_lowest\n",
    "# keeps an age of exactly 0 inside '0-18'.\n",
    "AGE_BINS = [0, 18, 30, 50, 70, np.inf]\n",
    "AGE_LABELS = ['0-18', '19-30', '31-50', '51-70', '71+']\n",
    "age_group = pd.cut(data['Age of death'], bins=AGE_BINS, labels=AGE_LABELS, include_lowest=True)\n",
    "\n",
    "# data2 is built row-for-row from data, so features and labels align by position.\n",
    "assert len(data2) == len(data), 'data2 is out of sync with data - re-run the cell that builds data2'\n",
    "\n",
    "# Ages outside the bins (e.g. negative values) get no label; such rows are skipped.\n",
    "has_label = age_group.notna().to_numpy()\n",
    "\n",
    "# Columns that are not valid classification features:\n",
    "#  - 'Id', 'Name': identifiers\n",
    "#  - 'Age of death', 'Age Group': the target and the value it is derived from\n",
    "#  - 'Death year': together with 'Birth year' it gives the age away (target leakage)\n",
    "# errors='ignore' because data2 only contains 'Age Group' when it was built after\n",
    "# an earlier run of this cell; on a fresh kernel the column does not exist there.\n",
    "# NOTE(review): this leaves 'Birth year' as the only feature; the encoded\n",
    "# categorical columns of data1 are candidates for additional features.\n",
    "class_excluded_columns = ['Id', 'Name', 'Age of death', 'Age Group', 'Death year']\n",
    "X_class = data2.loc[has_label].drop(columns=class_excluded_columns, errors='ignore')\n",
    "y_class = age_group[has_label].astype(str)\n",
    "print(X_class.columns)\n",
    "\n",
    "# Hold out 20% of the rows; stratify so that the small age groups are\n",
    "# represented in both parts.\n",
    "X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(\n",
    "    X_class, y_class, test_size=0.2, random_state=42, stratify=y_class\n",
    ")\n",
    "\n",
    "# Candidate classification models (the logistic-regression solver is set by the grid)\n",
    "models_class = {\n",
    "    'Logistic Regression': LogisticRegression(random_state=42, max_iter=5000),\n",
    "    'Random Forest Classifier': RandomForestClassifier(random_state=42),\n",
    "    'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=42)\n",
    "}\n",
    "\n",
    "# One pipeline per model: standardise the features, then fit the model.\n",
    "pipelines_class = {\n",
    "    name: Pipeline([\n",
    "        ('scaler', StandardScaler()),\n",
    "        ('model', model)\n",
    "    ])\n",
    "    for name, model in models_class.items()\n",
    "}\n",
    "\n",
    "# Set to True to repeat the full (slow) hyper-parameter search.\n",
    "RUN_FULL_GRID_SEARCH = False\n",
    "\n",
    "# Full search space\n",
    "full_param_grids_class = {\n",
    "    'Logistic Regression': {\n",
    "        'model__C': [0.1, 1, 10],\n",
    "        'model__solver': ['lbfgs', 'liblinear']\n",
    "    },\n",
    "    'Random Forest Classifier': {\n",
    "        'model__n_estimators': [100, 200, 300],\n",
    "        'model__max_depth': [None, 10, 20, 30]\n",
    "    },\n",
    "    'Gradient Boosting Classifier': {\n",
    "        'model__n_estimators': [100, 200, 300],\n",
    "        'model__learning_rate': [0.01, 0.1, 0.2],\n",
    "        'model__max_depth': [3, 5, 7]\n",
    "    }\n",
    "}\n",
    "\n",
    "# Best parameters saved from an earlier full search, used to skip the search.\n",
    "# NOTE(review): they were found with a different feature set; repeat the full\n",
    "# search (flag above) to confirm them.\n",
    "best_param_grids_class = {\n",
    "    'Logistic Regression': {\n",
    "        'model__C': [10],\n",
    "        'model__solver': ['lbfgs']\n",
    "    },\n",
    "    'Random Forest Classifier': {\n",
    "        'model__n_estimators': [200],\n",
    "        'model__max_depth': [30]\n",
    "    },\n",
    "    'Gradient Boosting Classifier': {\n",
    "        'model__n_estimators': [200],\n",
    "        'model__learning_rate': [0.1],\n",
    "        'model__max_depth': [7]\n",
    "    }\n",
    "}\n",
    "\n",
    "param_grids_class = full_param_grids_class if RUN_FULL_GRID_SEARCH else best_param_grids_class\n",
    "\n",
    "# Hyper-parameter search with 5-fold cross-validation on the training part.\n",
    "# GridSearchCV refits the best configuration on the whole training set\n",
    "# (refit=True is the default), so no separate fit() pass is needed afterwards.\n",
    "best_models_class = {}\n",
    "for name, pipeline in pipelines_class.items():\n",
    "    grid_search = GridSearchCV(pipeline, param_grids_class[name], cv=5, scoring='accuracy', n_jobs=-1)\n",
    "    grid_search.fit(X_train_class, y_train_class)\n",
    "    best_models_class[name] = grid_search.best_estimator_\n",
    "    print(f'Best parameters for {name}: {grid_search.best_params_}')\n",
    "\n",
    "# Evaluate the tuned models on the held-out test set\n",
    "for name, model in best_models_class.items():\n",
    "    y_pred_class = model.predict(X_test_class)\n",
    "    accuracy = accuracy_score(y_test_class, y_pred_class)\n",
    "    # zero_division=0 reports 0.0 for classes that are never predicted without\n",
    "    # emitting an UndefinedMetricWarning for each of them.\n",
    "    report = classification_report(y_test_class, y_pred_class, zero_division=0)\n",
    "    print(f'{name}: Accuracy = {accuracy}')\n",
    "    print(f'Classification Report:\\n{report}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "aimvenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}