AIM-PIbd-31-Kozyrev-S-S/lab_4/lab_4.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Вариант: Список людей. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 10000 entries, 0 to 9999\n",
      "Data columns (total 10 columns):\n",
      " #   Column             Non-Null Count  Dtype  \n",
      "---  ------             --------------  -----  \n",
      " 0   Id                 10000 non-null  object \n",
      " 1   Name               10000 non-null  object \n",
      " 2   Short description  9996 non-null   object \n",
      " 3   Gender             9927 non-null   object \n",
      " 4   Country            9721 non-null   object \n",
      " 5   Occupation         9836 non-null   object \n",
      " 6   Birth year         10000 non-null  int64  \n",
      " 7   Death year         9999 non-null   float64\n",
      " 8   Manner of death    1893 non-null   object \n",
      " 9   Age of death       9999 non-null   float64\n",
      "dtypes: float64(2), int64(1), object(7)\n",
      "memory usage: 781.4+ KB\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from imblearn.over_sampling import RandomOverSampler\n",
    "from imblearn.under_sampling import RandomUnderSampler\n",
    "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.linear_model import LinearRegression, LogisticRegression\n",
    "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier\n",
    "from sklearn.model_selection import train_test_split, GridSearchCV\n",
    "from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score\n",
    "import numpy as np\n",
    "import featuretools as ft\n",
    "from sklearn.metrics import accuracy_score, classification_report\n",
    "\n",
    "# Функция для применения oversampling\n",
    "def apply_oversampling(X, y):\n",
    "    oversampler = RandomOverSampler(random_state=42)\n",
    "    X_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
    "    return X_resampled, y_resampled\n",
    "\n",
    "# Функция для применения undersampling\n",
    "def apply_undersampling(X, y):\n",
    "    undersampler = RandomUnderSampler(random_state=42)\n",
    "    X_resampled, y_resampled = undersampler.fit_resample(X, y)\n",
    "    return X_resampled, y_resampled\n",
    "\n",
    "def split_stratified_into_train_val_test(\n",
    "    df_input,\n",
    "    stratify_colname=\"y\",\n",
    "    frac_train=0.6,\n",
    "    frac_val=0.15,\n",
    "    frac_test=0.25,\n",
    "    random_state=None,\n",
    "):\n",
    "    \"\"\"\n",
    "    Splits a Pandas dataframe into three subsets (train, val, and test)\n",
    "    following fractional ratios provided by the user, where each subset is\n",
    "    stratified by the values in a specific column (that is, each subset has\n",
    "    the same relative frequency of the values in the column). It performs this\n",
    "    splitting by running train_test_split() twice.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df_input : Pandas dataframe\n",
    "        Input dataframe to be split.\n",
    "    stratify_colname : str\n",
    "        The name of the column that will be used for stratification. Usually\n",
    "        this column would be for the label.\n",
    "    frac_train : float\n",
    "    frac_val   : float\n",
    "    frac_test  : float\n",
    "        The ratios with which the dataframe will be split into train, val, and\n",
    "        test data. The values should be expressed as float fractions and should\n",
    "        sum to 1.0.\n",
    "    random_state : int, None, or RandomStateInstance\n",
    "        Value to be passed to train_test_split().\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    df_train, df_val, df_test :\n",
    "        Dataframes containing the three splits.\n",
    "    \"\"\"\n",
    "\n",
    "    if frac_train + frac_val + frac_test != 1.0:\n",
    "        raise ValueError(\n",
    "            \"fractions %f, %f, %f do not add up to 1.0\"\n",
    "            % (frac_train, frac_val, frac_test)\n",
    "        )\n",
    "\n",
    "    if stratify_colname not in df_input.columns:\n",
    "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
    "\n",
    "    X = df_input  # Contains all columns.\n",
    "    y = df_input[\n",
    "        [stratify_colname]\n",
    "    ]  # Dataframe of just the column on which to stratify.\n",
    "\n",
    "    # Split original dataframe into train and temp dataframes.\n",
    "    df_train, df_temp, y_train, y_temp = train_test_split(\n",
    "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
    "    )\n",
    "\n",
    "    # Split the temp dataframe into val and test dataframes.\n",
    "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
    "    df_val, df_test, y_val, y_test = train_test_split(\n",
    "        df_temp,\n",
    "        y_temp,\n",
    "        stratify=y_temp,\n",
    "        test_size=relative_frac_test,\n",
    "        random_state=random_state,\n",
    "    )\n",
    "\n",
    "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
    "\n",
    "    return df_train, df_val, df_test\n",
    "\n",
    "\n",
    "df = pd.read_csv(\"../data/age.csv\", nrows=10000)\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Как бизнес-цели выделим следующие 2 варианта:\n",
    "    1) GameDev. Создание игры про конкретного персонажа, живущего в конкретном временном промежутке в конкретной стране. \n",
    "    2) Исследование зависимости длительности жизни от страны проживания.\n",
    "    \n",
    "Поскольку именно эти бизнес-цели были выбраны в предыдущей лабораторной работе, будем их использовать.\n",
    "Но возникает проблема с 1 целью: её невозможно использовать для задачи классификации. Заменим ее на классификацию людей по возрастным группам, что может быть полезно для рекламных целей."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Выполним подготовку данных"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.fillna({\"Gender\": \"NaN\", \"Country\": \"NaN\", \"Occupation\" : \"NaN\", \"Manner of death\" : \"NaN\"}, inplace=True)\n",
    "df = df.dropna()\n",
    "df['Country'] = df['Country'].str.split('; ')\n",
    "df = df.explode('Country')\n",
    "data = df.copy()\n",
    "\n",
    "value_counts = data[\"Country\"].value_counts()\n",
    "rare = value_counts[value_counts < 100].index\n",
    "data = data[~data[\"Country\"].isin(rare)]\n",
    "\n",
    "data.drop(data[~data['Gender'].isin(['Male', 'Female'])].index, inplace=True)\n",
    "\n",
    "data1 = pd.get_dummies(data, columns=['Gender', 'Country', 'Occupation'], drop_first=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Определить достижимый уровень качества модели для каждой задачи. На основе имеющихся данных уровень качества моделей не будет высоким, поскольку все таки длительность жизни лишь примерная и точно ее угадать невозможно."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Выберем ориентиры для наших 2х задач:\n",
    "    1)Регрессии - средний возраст человека\n",
    "    2)Классификации - аиболее часто встречающаяся возрастная группа"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Построим конвейер."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['Id', 'Name', 'Short description', 'Birth year', 'Death year',\n",
      "       'Age of death', 'Gender_Male', 'Country_France',\n",
      "       'Country_German Confederation', 'Country_German Democratic Republic',\n",
      "       ...\n",
      "       'Manner of death_euthanasia', 'Manner of death_homicide',\n",
      "       'Manner of death_homicide; natural causes',\n",
      "       'Manner of death_internal bleeding', 'Manner of death_natural causes',\n",
      "       'Manner of death_suicide',\n",
      "       'Manner of death_suicide; homicide; accident',\n",
      "       'Manner of death_suicide; unfortunate accident',\n",
      "       'Manner of death_summary execution', 'Manner of death_unnatural death'],\n",
      "      dtype='object', length=400)\n"
     ]
    }
   ],
   "source": [
    "print(data.columns)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters for Linear Regression: {}\n",
      "Best parameters for Random Forest Regressor: {'model__max_depth': None, 'model__n_estimators': 100}\n",
      "Best parameters for Gradient Boosting Regressor: {'model__learning_rate': 0.2, 'model__max_depth': 7, 'model__n_estimators': 300}\n",
      "Linear Regression: MSE = 0.002807184047660083, R2 = 0.9999899555289343\n",
      "Random Forest Regressor: MSE = 11.46917740409879, R2 = 0.9589617856804076\n",
      "Gradient Boosting Regressor: MSE = 8.202651735797296, R2 = 0.9706498410424512\n"
     ]
    }
   ],
   "source": [
    "X_reg = data1.drop(['Id', 'Name', 'Age of death', 'Short description', 'Manner of death'], axis=1)\n",
    "y_reg = data1['Age of death']\n",
    "\n",
    "# Разделение данных\n",
    "X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
    "\n",
    "# Выбор моделей для регрессии\n",
    "models_reg = {\n",
    "    'Linear Regression': LinearRegression(),\n",
    "    'Random Forest Regressor': RandomForestRegressor(random_state=42),\n",
    "    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42)\n",
    "}\n",
    "\n",
    "# Создание конвейера для регрессии\n",
    "pipelines_reg = {}\n",
    "for name, model in models_reg.items():\n",
    "    pipelines_reg[name] = Pipeline([\n",
    "        ('scaler', StandardScaler()),\n",
    "        ('model', model)\n",
    "    ])\n",
    "\n",
    "# Определение сетки гиперпараметров для регрессии\n",
    "param_grids_reg = {\n",
    "    'Linear Regression': {},\n",
    "    'Random Forest Regressor': {\n",
    "        'model__n_estimators': [100, 200, 300],\n",
    "        'model__max_depth': [None, 10, 20, 30]\n",
    "    },\n",
    "    'Gradient Boosting Regressor': {\n",
    "        'model__n_estimators': [100, 200, 300],\n",
    "        'model__learning_rate': [0.01, 0.1, 0.2],\n",
    "        'model__max_depth': [3, 5, 7]\n",
    "    }\n",
    "}\n",
    "\n",
    "# Настройка гиперпараметров для регрессии\n",
    "best_models_reg = {}\n",
    "for name, pipeline in pipelines_reg.items():\n",
    "    grid_search = GridSearchCV(pipeline, param_grids_reg[name], cv=5, scoring='neg_mean_squared_error')\n",
    "    grid_search.fit(X_train_reg, y_train_reg)\n",
    "    best_models_reg[name] = grid_search.best_estimator_\n",
    "    print(f'Best parameters for {name}: {grid_search.best_params_}')\n",
    "\n",
    "# Обучение моделей для регрессии\n",
    "for name, model in best_models_reg.items():\n",
    "    model.fit(X_train_reg, y_train_reg)\n",
    "\n",
    "# Оценка качества моделей для регрессии\n",
    "for name, model in best_models_reg.items():\n",
    "    y_pred_reg = model.predict(X_test_reg)\n",
    "    mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
    "    r2 = r2_score(y_test_reg, y_pred_reg)\n",
    "    print(f'{name}: MSE = {mse}, R2 = {r2}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "data2 = data.drop(['Short description', 'Manner of death', 'Gender', 'Country', 'Occupation'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['Birth year', 'Death year'], dtype='object')\n",
      "Best parameters for Logistic Regression: {'model__C': 10, 'model__solver': 'lbfgs'}\n",
      "Best parameters for Random Forest Classifier: {'model__max_depth': 30, 'model__n_estimators': 200}\n",
      "Best parameters for Gradient Boosting Classifier: {'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__n_estimators': 200}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
      "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
      "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Logistic Regression: Accuracy = 0.9248554913294798\n",
      "Classification Report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "        0-18       0.00      0.00      0.00         5\n",
      "       19-30       0.50      0.08      0.14        60\n",
      "       31-50       0.77      0.77      0.77       242\n",
      "       51-70       0.91      0.96      0.94       650\n",
      "         71+       0.98      1.00      0.99       946\n",
      "\n",
      "    accuracy                           0.92      1903\n",
      "   macro avg       0.63      0.56      0.57      1903\n",
      "weighted avg       0.91      0.92      0.91      1903\n",
      "\n",
      "Random Forest Classifier: Accuracy = 0.9485023646873357\n",
      "Classification Report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "        0-18       0.67      0.40      0.50         5\n",
      "       19-30       0.96      0.77      0.85        60\n",
      "       31-50       0.88      0.89      0.88       242\n",
      "       51-70       0.92      0.95      0.94       650\n",
      "         71+       0.99      0.97      0.98       946\n",
      "\n",
      "    accuracy                           0.95      1903\n",
      "   macro avg       0.88      0.80      0.83      1903\n",
      "weighted avg       0.95      0.95      0.95      1903\n",
      "\n",
      "Gradient Boosting Classifier: Accuracy = 0.9379926431949553\n",
      "Classification Report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "        0-18       1.00      0.40      0.57         5\n",
      "       19-30       0.96      0.77      0.85        60\n",
      "       31-50       0.87      0.87      0.87       242\n",
      "       51-70       0.90      0.95      0.92       650\n",
      "         71+       0.98      0.96      0.97       946\n",
      "\n",
      "    accuracy                           0.94      1903\n",
      "   macro avg       0.94      0.79      0.84      1903\n",
      "weighted avg       0.94      0.94      0.94      1903\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Создание возрастных групп\n",
    "bins = [0, 18, 30, 50, 70, 100]\n",
    "labels = ['0-18', '19-30', '31-50', '51-70', '71+']\n",
    "data['Age Group'] = pd.cut(data['Age of death'], bins=bins, labels=labels)\n",
    "\n",
    "# Выбор признаков и целевой переменной для классификации\n",
    "X_class = data2.drop(['Id', 'Name', 'Age of death', 'Age Group'], axis=1)\n",
    "y_class = data['Age Group']  \n",
    "print(X_class.columns)\n",
    "# Разделение данных\n",
    "X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n",
    "\n",
    "# Выбор моделей для классификации\n",
    "models_class = {\n",
    "    'Logistic Regression': LogisticRegression(random_state=42, max_iter=5000, solver='liblinear'),\n",
    "    'Random Forest Classifier': RandomForestClassifier(random_state=42),\n",
    "    'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=42)\n",
    "}\n",
    "\n",
    "# Создание конвейера для классификации\n",
    "pipelines_class = {}\n",
    "for name, model in models_class.items():\n",
    "    pipelines_class[name] = Pipeline([\n",
    "        ('scaler', StandardScaler()),\n",
    "        ('model', model)\n",
    "    ])\n",
    "\n",
    "# Определение сетки гиперпараметров для классификации\n",
    "'''\n",
    "param_grids_class = {\n",
    "    'Logistic Regression': {\n",
    "        'model__C': [0.1, 1, 10],\n",
    "        'model__solver': ['lbfgs', 'liblinear']\n",
    "    },\n",
    "    'Random Forest Classifier': {\n",
    "        'model__n_estimators': [100, 200, 300],\n",
    "        'model__max_depth': [None, 10, 20, 30]\n",
    "    },\n",
    "    'Gradient Boosting Classifier': {\n",
    "        'model__n_estimators': [100, 200, 300],\n",
    "        'model__learning_rate': [0.01, 0.1, 0.2],\n",
    "        'model__max_depth': [3, 5, 7]\n",
    "    }\n",
    "}'''\n",
    "# Убрал определение параметров поскольку уже был предподсчет данных, но вылетела ошибка. Сохранил лучшие параметры\n",
    "\n",
    "param_grids_class = {\n",
    "    'Logistic Regression': {\n",
    "        'model__C': [10],\n",
    "        'model__solver': ['lbfgs']\n",
    "    },\n",
    "    'Random Forest Classifier': {\n",
    "        'model__n_estimators': [200],\n",
    "        'model__max_depth': [ 30]\n",
    "    },\n",
    "    'Gradient Boosting Classifier': {\n",
    "        'model__n_estimators': [200],\n",
    "        'model__learning_rate': [0.1],\n",
    "        'model__max_depth': [7]\n",
    "    }\n",
    "}\n",
    "\n",
    "# Настройка гиперпараметров для классификации\n",
    "best_models_class = {}\n",
    "for name, pipeline in pipelines_class.items():\n",
    "    grid_search = GridSearchCV(pipeline, param_grids_class[name], cv=5, scoring='accuracy')\n",
    "    grid_search.fit(X_train_class, y_train_class)\n",
    "    best_models_class[name] = grid_search.best_estimator_\n",
    "    print(f'Best parameters for {name}: {grid_search.best_params_}')\n",
    "\n",
    "# Обучение моделей для классификации\n",
    "for name, model in best_models_class.items():\n",
    "    model.fit(X_train_class, y_train_class)\n",
    "\n",
    "# Оценка качества моделей для классификации\n",
    "for name, model in best_models_class.items():\n",
    "    y_pred_class = model.predict(X_test_class)\n",
    "    accuracy = accuracy_score(y_test_class, y_pred_class)\n",
    "    report = classification_report(y_test_class, y_pred_class)\n",
    "    print(f'{name}: Accuracy = {accuracy}')\n",
    "    print(f'Classification Report:\\n{report}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "aimvenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}