AIM-PIbd-31-Kozyrev-S-S/lab_3/lab_3.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Вариант: Список людей. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 100000 entries, 0 to 99999\n",
      "Data columns (total 10 columns):\n",
      " #   Column             Non-Null Count   Dtype  \n",
      "---  ------             --------------   -----  \n",
      " 0   Id                 100000 non-null  object \n",
      " 1   Name               100000 non-null  object \n",
      " 2   Short description  99923 non-null   object \n",
      " 3   Gender             98015 non-null   object \n",
      " 4   Country            94533 non-null   object \n",
      " 5   Occupation         97299 non-null   object \n",
      " 6   Birth year         100000 non-null  int64  \n",
      " 7   Death year         99999 non-null   float64\n",
      " 8   Manner of death    14821 non-null   object \n",
      " 9   Age of death       99999 non-null   float64\n",
      "dtypes: float64(2), int64(1), object(7)\n",
      "memory usage: 7.6+ MB\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from imblearn.over_sampling import RandomOverSampler\n",
    "from imblearn.under_sampling import RandomUnderSampler\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "import numpy as np\n",
    "import featuretools as ft\n",
    "\n",
    "\n",
    "# Apply random oversampling to a feature/target pair.\n",
    "def apply_oversampling(X, y):\n",
    "    \"\"\"Resample ``(X, y)`` with imblearn's ``RandomOverSampler``.\n",
    "\n",
    "    The sampler is created with a fixed ``random_state=42``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple\n",
    "        The ``(X_resampled, y_resampled)`` pair returned by ``fit_resample``.\n",
    "    \"\"\"\n",
    "    return RandomOverSampler(random_state=42).fit_resample(X, y)\n",
    "\n",
    "# Apply random undersampling to a feature/target pair.\n",
    "def apply_undersampling(X, y):\n",
    "    \"\"\"Resample ``(X, y)`` with imblearn's ``RandomUnderSampler``.\n",
    "\n",
    "    The sampler is created with a fixed ``random_state=42``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple\n",
    "        The ``(X_resampled, y_resampled)`` pair returned by ``fit_resample``.\n",
    "    \"\"\"\n",
    "    return RandomUnderSampler(random_state=42).fit_resample(X, y)\n",
    "\n",
    "def split_stratified_into_train_val_test(\n",
    "    df_input,\n",
    "    stratify_colname=\"y\",\n",
    "    frac_train=0.6,\n",
    "    frac_val=0.15,\n",
    "    frac_test=0.25,\n",
    "    random_state=None,\n",
    "):\n",
    "    \"\"\"\n",
    "    Splits a Pandas dataframe into three subsets (train, val, and test)\n",
    "    following fractional ratios provided by the user, where each subset is\n",
    "    stratified by the values in a specific column (that is, each subset has\n",
    "    the same relative frequency of the values in the column). It performs this\n",
    "    splitting by running train_test_split() twice.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df_input : Pandas dataframe\n",
    "        Input dataframe to be split.\n",
    "    stratify_colname : str\n",
    "        The name of the column that will be used for stratification. Usually\n",
    "        this column would be for the label.\n",
    "    frac_train : float\n",
    "    frac_val   : float\n",
    "    frac_test  : float\n",
    "        The ratios with which the dataframe will be split into train, val, and\n",
    "        test data. The values should be expressed as float fractions and should\n",
    "        sum to 1.0 (checked with a small tolerance for rounding error).\n",
    "    random_state : int, None, or RandomStateInstance\n",
    "        Value to be passed to both train_test_split() calls.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    df_train, df_val, df_test :\n",
    "        Dataframes containing the three splits.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If the fractions do not sum to 1.0, or if ``stratify_colname`` is not\n",
    "        a column of ``df_input``.\n",
    "    \"\"\"\n",
    "\n",
    "    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 are not exactly\n",
    "    # 1.0 in binary floating point and would be rejected by a strict `!=`.\n",
    "    if abs(frac_train + frac_val + frac_test - 1.0) > 1e-9:\n",
    "        raise ValueError(\n",
    "            \"fractions %f, %f, %f do not add up to 1.0\"\n",
    "            % (frac_train, frac_val, frac_test)\n",
    "        )\n",
    "\n",
    "    if stratify_colname not in df_input.columns:\n",
    "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
    "\n",
    "    X = df_input  # Contains all columns.\n",
    "    y = df_input[\n",
    "        [stratify_colname]\n",
    "    ]  # Dataframe of just the column on which to stratify.\n",
    "\n",
    "    # Split original dataframe into train and temp dataframes.\n",
    "    df_train, df_temp, _, y_temp = train_test_split(\n",
    "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
    "    )\n",
    "\n",
    "    # Split the temp dataframe into val and test dataframes. The test share is\n",
    "    # re-expressed relative to the temp frame, which holds only val + test rows.\n",
    "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
    "    df_val, df_test, _, _ = train_test_split(\n",
    "        df_temp,\n",
    "        y_temp,\n",
    "        stratify=y_temp,\n",
    "        test_size=relative_frac_test,\n",
    "        random_state=random_state,\n",
    "    )\n",
    "\n",
    "    # Sanity check: no row was lost or duplicated by the two-stage split.\n",
    "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
    "\n",
    "    return df_train, df_val, df_test\n",
    "\n",
    "\n",
    "# Load only the first 100,000 rows of the dataset and summarise its columns.\n",
    "df = pd.read_csv(\"../data/age.csv\", nrows=100_000)\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Такую информацию могут использовать компании, связанные с историей и культурой, разработкой игр (GameDev) и кинопроизводством. Например, реальные имена помогут сделать фильм исторически более достоверным."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "В качестве бизнес-целей выделим следующие два варианта:\n",
    "\n",
    "1. GameDev: создание игры про конкретного персонажа, живущего в конкретный временной промежуток в конкретной стране.\n",
    "2. Исследование зависимости продолжительности жизни от страны проживания."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Поскольку данные неполные, пропуски необходимо заполнить стандартными значениями:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Id                       0\n",
      "Name                     0\n",
      "Short description       77\n",
      "Gender                1985\n",
      "Country               5467\n",
      "Occupation            2701\n",
      "Birth year               0\n",
      "Death year               1\n",
      "Manner of death      85179\n",
      "Age of death             1\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Missing-value count for every column.\n",
    "print(df.isna().sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 99922 entries, 0 to 99999\n",
      "Data columns (total 10 columns):\n",
      " #   Column             Non-Null Count  Dtype  \n",
      "---  ------             --------------  -----  \n",
      " 0   Id                 99922 non-null  object \n",
      " 1   Name               99922 non-null  object \n",
      " 2   Short description  99922 non-null  object \n",
      " 3   Gender             99922 non-null  object \n",
      " 4   Country            99922 non-null  object \n",
      " 5   Occupation         99922 non-null  object \n",
      " 6   Birth year         99922 non-null  int64  \n",
      " 7   Death year         99922 non-null  float64\n",
      " 8   Manner of death    99922 non-null  object \n",
      " 9   Age of death       99922 non-null  float64\n",
      "dtypes: float64(2), int64(1), object(7)\n",
      "memory usage: 8.4+ MB\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>Name</th>\n",
       "      <th>Short description</th>\n",
       "      <th>Gender</th>\n",
       "      <th>Country</th>\n",
       "      <th>Occupation</th>\n",
       "      <th>Birth year</th>\n",
       "      <th>Death year</th>\n",
       "      <th>Manner of death</th>\n",
       "      <th>Age of death</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>99995</th>\n",
       "      <td>Q729652</td>\n",
       "      <td>Jacques-Joseph Moreau</td>\n",
       "      <td>French psychiatrist</td>\n",
       "      <td>Male</td>\n",
       "      <td>France</td>\n",
       "      <td>Psychiatrist; psychologist</td>\n",
       "      <td>1804</td>\n",
       "      <td>1884.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99996</th>\n",
       "      <td>Q729661</td>\n",
       "      <td>Jerome Wiesner</td>\n",
       "      <td>American academic engineer</td>\n",
       "      <td>Male</td>\n",
       "      <td>United States of America</td>\n",
       "      <td>Researcher</td>\n",
       "      <td>1915</td>\n",
       "      <td>1994.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>79.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99997</th>\n",
       "      <td>Q729662</td>\n",
       "      <td>Westmoreland Davis</td>\n",
       "      <td>American politician (1859-1942)</td>\n",
       "      <td>Male</td>\n",
       "      <td>United States of America</td>\n",
       "      <td>Politician</td>\n",
       "      <td>1859</td>\n",
       "      <td>1942.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>83.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99998</th>\n",
       "      <td>Q729674</td>\n",
       "      <td>John Needham</td>\n",
       "      <td>English biologist and Roman Catholic priest</td>\n",
       "      <td>Male</td>\n",
       "      <td>England</td>\n",
       "      <td>Religious figure</td>\n",
       "      <td>1713</td>\n",
       "      <td>1810.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>97.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99999</th>\n",
       "      <td>Q729679</td>\n",
       "      <td>Francis Bourne</td>\n",
       "      <td>Catholic cardinal</td>\n",
       "      <td>Male</td>\n",
       "      <td>United Kingdom</td>\n",
       "      <td>Religious figure</td>\n",
       "      <td>1861</td>\n",
       "      <td>1934.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>73.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            Id                   Name  \\\n",
       "99995  Q729652  Jacques-Joseph Moreau   \n",
       "99996  Q729661         Jerome Wiesner   \n",
       "99997  Q729662     Westmoreland Davis   \n",
       "99998  Q729674           John Needham   \n",
       "99999  Q729679         Francis Bourne   \n",
       "\n",
       "                                 Short description Gender  \\\n",
       "99995                          French psychiatrist   Male   \n",
       "99996                   American academic engineer   Male   \n",
       "99997              American politician (1859-1942)   Male   \n",
       "99998  English biologist and Roman Catholic priest   Male   \n",
       "99999                            Catholic cardinal   Male   \n",
       "\n",
       "                        Country                  Occupation  Birth year  \\\n",
       "99995                    France  Psychiatrist; psychologist        1804   \n",
       "99996  United States of America                  Researcher        1915   \n",
       "99997  United States of America                  Politician        1859   \n",
       "99998                   England            Religious figure        1713   \n",
       "99999            United Kingdom            Religious figure        1861   \n",
       "\n",
       "       Death year Manner of death  Age of death  \n",
       "99995      1884.0             NaN          80.0  \n",
       "99996      1994.0             NaN          79.0  \n",
       "99997      1942.0             NaN          83.0  \n",
       "99998      1810.0             NaN          97.0  \n",
       "99999      1934.0             NaN          73.0  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Replace missing categorical values with the literal string \"NaN\", then drop\n",
    "# every row that still has a missing value in any remaining column.\n",
    "df = df.fillna(\n",
    "    {\n",
    "        column: \"NaN\"\n",
    "        for column in (\"Gender\", \"Country\", \"Occupation\", \"Manner of death\")\n",
    "    }\n",
    ").dropna()\n",
    "df.info()\n",
    "df.tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Пропуски заполнены; удалены только те строки, в которых не было года смерти или краткого описания."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Axes: ylabel='Frequency'>"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlUAAAGdCAYAAAA7VYb2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA5oUlEQVR4nO3de1xVVeL///cRPCDGxRscGBHxnnezhmFSy9EBLx+zdD5TanljdGpwKjEzv6mp9UjDoqwspykvPbJ0/IxZo+WIl7QSNS+EWg9KU6kR1E9eTmgCwv790Y/98QgiHjZwDuf1fDz2Q/dea++99lnCebv2OvvYDMMwBAAAgCqpV9sNAAAAqAsIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAf/aboA3KCkp0YkTJxQcHCybzVbbzQEAAJVgGIZ++uknRUVFqV696h9HIlRVwokTJxQdHV3bzQAAAG74/vvv1bx582o/D6GqEoKDgyX90ikhISG13BoAAFAZTqdT0dHR5vt4dSNUVULpLb+QkBBCFQAAXqampu4wUR0AAMAChCoAAAALEKoAAAAswJwqixiGocuXL6u4uLi2mwKL1K9fX35+frXdDACAlyBUWaCwsFC5ubm6ePFibTcFFrLZbGrevLluuumm2m4KAMALEKqqqKSkREePHpWfn5+ioqJkt9t5QGgdYBiGTp8+rR9++EFt27ZlxAoAcF2EqioqLCxUSUmJoqOjFRQUVNvNgYWaNWumY8eOqaioiFAFALguJqpbpCYef4+axYgjAOBGkAQAAAAsQKhChY4dOyabzabMzEy39rfZbFq7dq2lbQIAwBMxp6oatXxifY2e79j8wTdUf+zYsVq+fLm53rhxY912221KTU1V165dJUnR0dHKzc1V06ZNKzzW7NmztXbtWrfDFwAA3o6RKh83YMAA5ebmKjc3V5s3b5a/v7/+67/+yyz38/OTw+GQv3/5+bv0+VzerrCwsLabAADwcoQqHxcQECCHwyGHw6Hu3bvriSee0Pfff6/Tp09LKnv775NPPpHNZtPHH3+snj17KiAgQO+8847mzJmjL7/8UjabTTabTcuWLTPP8b//+7+65557FBQUpLZt2+rDDz+8Znvmzp2rzp07l9nevXt3zZw501x/8803dfPNNyswMFAdOnTQa6+95lJ/2rRpateunYKCgtSqVSvNnDlTRUVFZvns2bPVvXt3vfnmm4qNjVVgYKA7Lx8AACZu/8GUn5+vd955R23atFGTJk0qrPvEE0/o+eefV6tWrRQYGKgpU6Zow4YN2rRpkyQpNDTUrDtnzhylpqZqwYIFeuWVVzRq1CgdP35cjRs3LnPc8ePHa86cOfriiy902223SZL279+vrKwsrVmzRpK0YsUKzZo1S6+++qp69Oih/fv3a8KECWrYsKHGjBkjSQoODtayZcsUFRWlAwcOaMKECQoODtbjjz9unuvw4cP65z//qTVr1vDIBABAlRGqfNy6devMJ4ZfuHBBkZGRWrdu3XUfETF37lz9/ve/N9dvuukm+fv7y+FwlKk7duxYjRgxQpL07LPP6uWXX9bu3bs1YMCAMnWbN2+uxMRELV261AxVS5cu1R133KFWrVpJkp566im98MILGjZsmCQpNjZWX331lf72t7+ZoWrGjBnmMVu2bKnHHntMK1eudAlVhYWFevvtt9WsWbPrv1AAgEpp+cT6G57jW1dw+8/H9e3bV5mZmcrMzNTu3buVmJiogQMH6vjx4xXud+utt1b6HKWT3iWpYcOGCgkJ0alTp65Zf8KECXrvvfd06dIlFRYW6t1339X48eMl/RL8jhw5oqSkJN10003m8swzz+jIkSPmMVatWqXbb79dDodDN910k2bMmKGcnByX88TExBCoAACWYaTKxzVs2FBt2rQx1998802Fhobq73//u5555pkK96us+vXru6zb
bDaVlJRcs/6QIUMUEBCg999/X3a7XUVFRfrDH/4g6ZdblJL097//XXFxcS77ld7Cy8jI0KhRozRnzhwlJiYqNDRUK1eu1AsvvOD2NQAAcD2EKriw2WyqV6+efv755xvaz263q7i42JI2+Pv7a8yYMVq6dKnsdrvuu+8+NWjQQJIUERGhqKgofffddxo1alS5++/YsUMxMTF68sknzW3XG3kDAKCqCFU+rqCgQHl5eZKks2fP6tVXX1V+fr6GDBlyQ8dp2bKljh49qszMTDVv3lzBwcEKCAhwu11/+tOfdPPNN0uSPv/8c5eyOXPm6OGHH1ZoaKgGDBiggoIC7dmzR2fPnlVKSoratm2rnJwcrVy5UrfddpvWr1+v999/3+22AABQGcyp8nEbNmxQZGSkIiMjFRcXpy+++EKrV6/WnXfeeUPHGT58uAYMGKC+ffuqWbNmeu+996rUrrZt2+q3v/2tOnToUOY235/+9Ce9+eabWrp0qbp06aI77rhDy5YtU2xsrCTprrvu0uTJkzVp0iR1795dO3bscHkcAwAA1cFmGIZR243wdE6nU6GhoTp//rxCQkJcyi5duqSjR4/yrCOLGYahtm3b6i9/+YtSUlJqpQ30LQDcOE/69F9F79/VoVZHqrZv364hQ4YoKiqq3O+IK32Q5NXLggULzDotW7YsUz5//nyX42RlZal3794KDAxUdHS0UlNTa+Ly4KbTp0/r1VdfVV5ensaNG1fbzQEAoFJqdU7VhQsX1K1bN40fP9585tCVcnNzXdY//vhjJSUlafjw4S7b586dqwkTJpjrwcHB5t+dTqcSEhLUv39/LV68WAcOHND48eMVFhamiRMnWnxFsEJ4eLiaNm2qN954Q40aNart5gAAUCm1GqoGDhyogQMHXrP86gdJfvDBB+rbt6/5EMhSwcHB5T50Uvrl6duFhYVasmSJ7Ha7OnXqpMzMTKWlpRGqPBR3pAEA3shrJqqfPHlS69evV1JSUpmy+fPnq0mTJurRo4cWLFjg8gW/GRkZ6tOnj+x2u7ktMTFR2dnZOnv2bLnnKigokNPpdFkAAAAq4jWPVFi+fLmCg4PL3CZ8+OGHdcstt6hx48basWOHpk+frtzcXKWlpUmS8vLyzE+FlYqIiDDLyru9NG/ePM2ZM6eargQAANRFXhOqlixZolGjRpX5FNaVnwzr2rWr7Ha7/vznP2vevHluPydp+vTpLsd1Op2Kjo6ucB9uWdU99CkA4EZ4Raj69NNPlZ2drVWrVl23blxcnC5fvqxjx46pffv2cjgcOnnypEud0vVrzcMKCAiodCAr/QqWixcvmk/9Rt1QWFgo6f++/gYAgIp4Rah666231LNnT3Xr1u26dTMzM1WvXj2Fh4dLkuLj4/Xkk0+qqKjIDEDp6elq3769JZ8s8/PzU1hYmPkFwUFBQbLZbFU+LmpXSUmJTp8+raCgIPn7e8WPCQCgltXqu0V+fr4OHz5srpd+zUnjxo3VokULSb/celu9enWZL8OVfpmEvmvXLvXt21fBwcHKyMjQ5MmTdf/995uBaeTIkZozZ46SkpI0bdo0HTx4UAsXLtSLL75o2XWUjniVBivUDfXq1VOLFi0IyQCASqnVULVnzx717dvXXC+dxzRmzBgtW7ZMkrRy5UoZhqERI0aU2T8gIEArV67U7NmzVVBQoNjYWE2ePNllPlRoaKg2btyo5ORk9ezZU02bNtWsWbMsfZyCzWZTZGSkwsPDVVRUZNlxUbvsdrvq1fOaD8gCAGoZX1NTCTX9mHsAALwVX1MDAACAKiFUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAA
FiBUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQh
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Histogram of birth years, restricted to the years 1000-2000 on the x axis.\n",
    "ax = df.plot.hist(column=[\"Birth year\"], xlim=(1000, 2000), bins=4000)\n",
    "# Label the figure so it can be read on its own; the trailing ';' hides the repr.\n",
    "ax.set(\n",
    "    title=\"Распределение годов рождения\",\n",
    "    xlabel=\"Год рождения\",\n",
    "    ylabel=\"Количество записей\",\n",
    ");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Помимо этого, обработаем столбец со странами так, чтобы каждый человек, живший более чем в одной стране, занимал несколько строк — по одной на каждую страну, в которой он жил."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 116555 entries, 0 to 99999\n",
      "Data columns (total 10 columns):\n",
      " #   Column             Non-Null Count   Dtype  \n",
      "---  ------             --------------   -----  \n",
      " 0   Id                 116555 non-null  object \n",
      " 1   Name               116555 non-null  object \n",
      " 2   Short description  116555 non-null  object \n",
      " 3   Gender             116555 non-null  object \n",
      " 4   Country            116555 non-null  object \n",
      " 5   Occupation         116555 non-null  object \n",
      " 6   Birth year         116555 non-null  int64  \n",
      " 7   Death year         116555 non-null  float64\n",
      " 8   Manner of death    116555 non-null  object \n",
      " 9   Age of death       116555 non-null  float64\n",
      "dtypes: float64(2), int64(1), object(7)\n",
      "memory usage: 9.8+ MB\n"
     ]
    }
   ],
   "source": [
    "# Turn the \"; \"-separated country string into a list, then emit one row per\n",
    "# list element so that every (person, country) pair gets its own row.\n",
    "df = df.assign(Country=df[\"Country\"].str.split(\"; \")).explode(\"Country\")\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Далее выполним разбиение на обучающую, контрольную и тестовую выборки."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "121\n",
      "Обучающая выборка:  (67038, 10)\n",
      "Country\n",
      "Germany                       15128\n",
      "United States of America       8946\n",
      "France                         4715\n",
      "NaN                            3248\n",
      "United Kingdom                 2796\n",
      "                              ...  \n",
      "Song dynasty                     32\n",
      "Paraguay                         31\n",
      "Kingdom of Sardinia              31\n",
      "Confederation of the Rhine       30\n",
      "Kingdom of Saxony                30\n",
      "Name: count, Length: 121, dtype: int64\n",
      "Контрольная выборка:  (22346, 10)\n",
      "Country\n",
      "Germany                       5043\n",
      "United States of America      2982\n",
      "France                        1572\n",
      "NaN                           1082\n",
      "United Kingdom                 932\n",
      "                              ... \n",
      "Vietnam                         11\n",
      "Paraguay                        10\n",
      "Kingdom of Saxony               10\n",
      "Confederation of the Rhine      10\n",
      "Kingdom of Sardinia             10\n",
      "Name: count, Length: 121, dtype: int64\n",
      "Тестовая выборка:  (22347, 10)\n",
      "Country\n",
      "Germany                       5043\n",
      "United States of America      2982\n",
      "France                        1572\n",
      "NaN                           1083\n",
      "United Kingdom                 933\n",
      "                              ... \n",
      "England                         11\n",
      "Confederation of the Rhine      10\n",
      "Paraguay                        10\n",
      "Kingdom of Sardinia             10\n",
      "Kingdom of Saxony               10\n",
      "Name: count, Length: 121, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "MIN_COUNTRY_ROWS = 50  # countries with fewer rows than this are dropped as rare\n",
    "SPLIT_SEED = 42  # fixed seed so the split is identical on every run\n",
    "\n",
    "data = df.copy()\n",
    "\n",
    "# Remove rare countries before the stratified split.\n",
    "value_counts = data[\"Country\"].value_counts()\n",
    "rare = value_counts[value_counts < MIN_COUNTRY_ROWS].index\n",
    "data = data[~data[\"Country\"].isin(rare)]\n",
    "\n",
    "# Number of distinct countries that remain after filtering.\n",
    "print(len(data[\"Country\"].unique()))\n",
    "\n",
    "# 60/20/20 split stratified by country; the seed makes it reproducible.\n",
    "df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
    "    data,\n",
    "    stratify_colname=\"Country\",\n",
    "    frac_train=0.60,\n",
    "    frac_val=0.20,\n",
    "    frac_test=0.20,\n",
    "    random_state=SPLIT_SEED,\n",
    ")\n",
    "\n",
    "print(\"Обучающая выборка: \", df_train.shape)\n",
    "print(df_train[\"Country\"].value_counts())\n",
    "\n",
    "print(\"Контрольная выборка: \", df_val.shape)\n",
    "print(df_val[\"Country\"].value_counts())\n",
    "\n",
    "print(\"Тестовая выборка: \", df_test.shape)\n",
    "print(df_test[\"Country\"].value_counts())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Из данных были удалены строки с «редкими» странами. Наращивать данные не будем, поскольку в этом нет необходимости.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Выполним конструирование признаков. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Начнём с унитарного кодирования категориальных признаков. Для этого подходит столбец «Country» (страна)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Country_Albania</th>\n",
       "      <th>Country_Argentina</th>\n",
       "      <th>Country_Australia</th>\n",
       "      <th>Country_Austria</th>\n",
       "      <th>Country_Austria-Hungary</th>\n",
       "      <th>Country_Austrian Empire</th>\n",
       "      <th>Country_Belgium</th>\n",
       "      <th>Country_Bolivia</th>\n",
       "      <th>Country_Brazil</th>\n",
       "      <th>Country_British Raj</th>\n",
       "      <th>...</th>\n",
       "      <th>Country_United Kingdom of Great Britain and Ireland</th>\n",
       "      <th>Country_United States of America</th>\n",
       "      <th>Country_Uruguay</th>\n",
       "      <th>Country_Venezuela</th>\n",
       "      <th>Country_Vietnam</th>\n",
       "      <th>Country_Wales</th>\n",
       "      <th>Country_Weimar Republic</th>\n",
       "      <th>Country_West Germany</th>\n",
       "      <th>Country_Yugoslavia</th>\n",
       "      <th>Country_ancient Rome</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111726</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111727</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111728</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111729</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111730</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>111731 rows × 120 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Country_Albania  Country_Argentina  Country_Australia  \\\n",
       "0                   0.0                0.0                0.0   \n",
       "1                   0.0                0.0                0.0   \n",
       "2                   0.0                0.0                0.0   \n",
       "3                   0.0                0.0                0.0   \n",
       "4                   0.0                0.0                0.0   \n",
       "...                 ...                ...                ...   \n",
       "111726              0.0                0.0                0.0   \n",
       "111727              0.0                0.0                0.0   \n",
       "111728              0.0                0.0                0.0   \n",
       "111729              0.0                0.0                0.0   \n",
       "111730              0.0                0.0                0.0   \n",
       "\n",
       "        Country_Austria  Country_Austria-Hungary  Country_Austrian Empire  \\\n",
       "0                   0.0                      0.0                      0.0   \n",
       "1                   0.0                      0.0                      0.0   \n",
       "2                   0.0                      0.0                      0.0   \n",
       "3                   0.0                      0.0                      0.0   \n",
       "4                   0.0                      0.0                      0.0   \n",
       "...                 ...                      ...                      ...   \n",
       "111726              0.0                      0.0                      0.0   \n",
       "111727              0.0                      0.0                      0.0   \n",
       "111728              0.0                      0.0                      0.0   \n",
       "111729              0.0                      0.0                      0.0   \n",
       "111730              0.0                      0.0                      0.0   \n",
       "\n",
       "        Country_Belgium  Country_Bolivia  Country_Brazil  Country_British Raj  \\\n",
       "0                   0.0              0.0             0.0                  0.0   \n",
       "1                   0.0              0.0             0.0                  0.0   \n",
       "2                   0.0              0.0             0.0                  0.0   \n",
       "3                   0.0              0.0             0.0                  0.0   \n",
       "4                   0.0              0.0             0.0                  0.0   \n",
       "...                 ...              ...             ...                  ...   \n",
       "111726              0.0              0.0             0.0                  0.0   \n",
       "111727              0.0              0.0             0.0                  0.0   \n",
       "111728              0.0              0.0             0.0                  0.0   \n",
       "111729              0.0              0.0             0.0                  0.0   \n",
       "111730              0.0              0.0             0.0                  0.0   \n",
       "\n",
       "        ...  Country_United Kingdom of Great Britain and Ireland  \\\n",
       "0       ...                                                0.0     \n",
       "1       ...                                                0.0     \n",
       "2       ...                                                0.0     \n",
       "3       ...                                                0.0     \n",
       "4       ...                                                0.0     \n",
       "...     ...                                                ...     \n",
       "111726  ...                                                0.0     \n",
       "111727  ...                                                0.0     \n",
       "111728  ...                                                0.0     \n",
       "111729  ...                                                0.0     \n",
       "111730  ...                                                0.0     \n",
       "\n",
       "        Country_United States of America  Country_Uruguay  Country_Venezuela  \\\n",
       "0                                    1.0              0.0                0.0   \n",
       "1                                    0.0              0.0                0.0   \n",
       "2                                    0.0              0.0                0.0   \n",
       "3                                    1.0              0.0                0.0   \n",
       "4                                    0.0              0.0                0.0   \n",
       "...                                  ...              ...                ...   \n",
       "111726                               0.0              0.0                0.0   \n",
       "111727                               1.0              0.0                0.0   \n",
       "111728                               1.0              0.0                0.0   \n",
       "111729                               0.0              0.0                0.0   \n",
       "111730                               0.0              0.0                0.0   \n",
       "\n",
       "        Country_Vietnam  Country_Wales  Country_Weimar Republic  \\\n",
       "0                   0.0            0.0                      0.0   \n",
       "1                   0.0            0.0                      0.0   \n",
       "2                   0.0            0.0                      0.0   \n",
       "3                   0.0            0.0                      0.0   \n",
       "4                   0.0            0.0                      0.0   \n",
       "...                 ...            ...                      ...   \n",
       "111726              0.0            0.0                      0.0   \n",
       "111727              0.0            0.0                      0.0   \n",
       "111728              0.0            0.0                      0.0   \n",
       "111729              0.0            0.0                      0.0   \n",
       "111730              0.0            0.0                      0.0   \n",
       "\n",
       "        Country_West Germany  Country_Yugoslavia  Country_ancient Rome  \n",
       "0                        0.0                 0.0                   0.0  \n",
       "1                        0.0                 0.0                   0.0  \n",
       "2                        0.0                 0.0                   0.0  \n",
       "3                        0.0                 0.0                   0.0  \n",
       "4                        0.0                 0.0                   0.0  \n",
       "...                      ...                 ...                   ...  \n",
       "111726                   0.0                 0.0                   0.0  \n",
       "111727                   0.0                 0.0                   0.0  \n",
       "111728                   0.0                 0.0                   0.0  \n",
       "111729                   0.0                 0.0                   0.0  \n",
       "111730                   0.0                 0.0                   0.0  \n",
       "\n",
       "[111731 rows x 120 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One-hot encode the \"Country\" column. drop=\"first\" omits the first category so\n",
    "# the resulting indicator columns are not linearly dependent.\n",
    "encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
    "\n",
    "encoded_values = encoder.fit_transform(data[[\"Country\"]])\n",
    "\n",
    "encoded_columns = encoder.get_feature_names_out([\"Country\"])\n",
    "\n",
    "# Reuse the row labels of `data`: with the default RangeIndex the encoded rows\n",
    "# may not line up with `data`'s own index when the two frames are joined.\n",
    "encoded_values_df = pd.DataFrame(\n",
    "    encoded_values, columns=encoded_columns, index=data.index\n",
    ")\n",
    "\n",
    "encoded_values_df\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Далее выполним дискретизацию числового признака «Age of death»."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age of death</th>\n",
       "      <th>Age of death</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>67.0</td>\n",
       "      <td>middle-aged</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>67.0</td>\n",
       "      <td>middle-aged</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>49.0</td>\n",
       "      <td>middle-aged</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>56.0</td>\n",
       "      <td>middle-aged</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>57.0</td>\n",
       "      <td>middle-aged</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>57.0</td>\n",
       "      <td>middle-aged</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>42.0</td>\n",
       "      <td>middle-aged</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>88.0</td>\n",
       "      <td>old</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>86.0</td>\n",
       "      <td>old</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>61.0</td>\n",
       "      <td>middle-aged</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>73.0</td>\n",
       "      <td>middle-aged</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>73.0</td>\n",
       "      <td>middle-aged</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>42.0</td>\n",
       "      <td>middle-aged</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>98.0</td>\n",
       "      <td>old</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>56.0</td>\n",
       "      <td>middle-aged</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>56.0</td>\n",
       "      <td>middle-aged</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>56.0</td>\n",
       "      <td>middle-aged</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>56.0</td>\n",
       "      <td>middle-aged</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>63.0</td>\n",
       "      <td>middle-aged</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>91.0</td>\n",
       "      <td>old</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Age of death Age of death\n",
       "0          67.0  middle-aged\n",
       "0          67.0  middle-aged\n",
       "1          49.0  middle-aged\n",
       "2          56.0  middle-aged\n",
       "4          57.0  middle-aged\n",
       "4          57.0  middle-aged\n",
       "5          42.0  middle-aged\n",
       "6          88.0          old\n",
       "7          86.0          old\n",
       "8          61.0  middle-aged\n",
       "9          73.0  middle-aged\n",
       "9          73.0  middle-aged\n",
       "10         42.0  middle-aged\n",
       "12         98.0          old\n",
       "13         56.0  middle-aged\n",
       "14         56.0  middle-aged\n",
       "14         56.0  middle-aged\n",
       "14         56.0  middle-aged\n",
       "16         63.0  middle-aged\n",
       "17         91.0          old"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "labels = [\"young\", \"middle-aged\", \"old\"]\n",
    "num_bins = 3\n",
    "# Equal-width bin edges over the observed range of ages; missing ages are\n",
    "# replaced by the median only for the purpose of computing the edges.\n",
    "hist1, bins1 = np.histogram(data[\"Age of death\"].fillna(data[\"Age of death\"].median()), bins=num_bins)\n",
    "# include_lowest=True closes the first interval on the left. Without it, rows\n",
    "# whose age equals the first edge (the minimum) fall outside every bin -> NaN.\n",
    "pd.concat(\n",
    "    [\n",
    "        data[\"Age of death\"],\n",
    "        pd.cut(data[\"Age of death\"], list(bins1), labels=labels, include_lowest=True),\n",
    "    ],\n",
    "    axis=1,\n",
    ").head(20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Выполнить «ручной» синтез признаков в рамках данного набора данных не представляется возможным."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Масштабирование признаков на основе нормировки и стандартизации в рамках данного набора данных не является необходимым."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Выполним конструирование признаков с применением фреймворка Featuretools. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  pd.to_datetime(\n",
      "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  pd.to_datetime(\n",
      "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  pd.to_datetime(\n",
      "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  pd.to_datetime(\n",
      "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  pd.to_datetime(\n",
      "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  pd.to_datetime(\n",
      "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  pd.to_datetime(\n",
      "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  pd.to_datetime(\n",
      "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  pd.to_datetime(\n",
      "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  pd.to_datetime(\n",
      "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  pd.to_datetime(\n",
      "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  pd.to_datetime(\n",
      "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  pd.to_datetime(\n",
      "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  pd.to_datetime(\n",
      "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Built 7 features\n",
      "Elapsed: 00:00 | Progress: 100%|██████████\n",
      "     Gender                   Country    Occupation  Birth year  Death year  \\\n",
      "Id                                                                            \n",
      "Q23    Male  United States of America    Politician        1732      1799.0   \n",
      "Q42    Male            United Kingdom        Artist        1952      2001.0   \n",
      "Q91    Male  United States of America    Politician        1809      1865.0   \n",
      "Q255   Male         Holy Roman Empire        Artist        1770      1827.0   \n",
      "Q260   Male         Kingdom of France  Egyptologist        1790      1832.0   \n",
      "\n",
      "     Manner of death  Age of death  \n",
      "Id                                  \n",
      "Q23   natural causes          67.0  \n",
      "Q42   natural causes          49.0  \n",
      "Q91         homicide          56.0  \n",
      "Q255             NaN          57.0  \n",
      "Q260  natural causes          42.0  \n"
     ]
    }
   ],
   "source": [
    "# Keep only the first row for every Id (presumably so that \"Id\" can serve as a\n",
    "# unique index of the EntitySet dataframe below).\n",
    "data1 = data.drop_duplicates(subset=\"Id\", keep=\"first\")\n",
    "\n",
    "df_train = pd.DataFrame(data1)\n",
    "\n",
    "# Create the EntitySet\n",
    "es = ft.EntitySet(id='death_data')\n",
    "\n",
    "# Register the DataFrame in the EntitySet, using the existing \"Id\" column as index\n",
    "es = es.add_dataframe(\n",
    "    dataframe_name='deaths',\n",
    "    dataframe=df_train,\n",
    "    index='Id',\n",
    "    make_index=False\n",
    ")\n",
    "\n",
    "# Run Deep Feature Synthesis with the default primitives.\n",
    "# max_depth=1: the EntitySet holds a single dataframe, so featuretools lowers any\n",
    "# larger depth to 1 anyway and emits a UserWarning about it.\n",
    "feature_matrix, feature_defs = ft.dfs(\n",
    "    entityset=es,\n",
    "    target_dataframe_name='deaths',\n",
    "    max_depth=1,\n",
    "    verbose=1,\n",
    "    n_jobs=1\n",
    ")\n",
    "\n",
    "# Show the first rows of the generated feature matrix (rich display)\n",
    "feature_matrix.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Все наборы признаков имеют низкую предсказательную способность, высокую скорость вычисления, а также низкие надёжность, корреляцию и цельность. Они не являются информативными, как и сам набор данных."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "aimvenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}