1301 lines
70 KiB
Plaintext
1301 lines
70 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Вариант: Список людей. "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 27,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 100000 entries, 0 to 99999\n",
|
||
"Data columns (total 10 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 Id 100000 non-null object \n",
|
||
" 1 Name 100000 non-null object \n",
|
||
" 2 Short description 99923 non-null object \n",
|
||
" 3 Gender 98015 non-null object \n",
|
||
" 4 Country 94533 non-null object \n",
|
||
" 5 Occupation 97299 non-null object \n",
|
||
" 6 Birth year 100000 non-null int64 \n",
|
||
" 7 Death year 99999 non-null float64\n",
|
||
" 8 Manner of death 14821 non-null object \n",
|
||
" 9 Age of death 99999 non-null float64\n",
|
||
"dtypes: float64(2), int64(1), object(7)\n",
|
||
"memory usage: 7.6+ MB\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from sklearn.preprocessing import LabelEncoder\n",
|
||
"from imblearn.over_sampling import RandomOverSampler\n",
|
||
"from imblearn.under_sampling import RandomUnderSampler\n",
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"import numpy as np\n",
|
||
"import featuretools as ft\n",
|
||
"\n",
|
||
"\n",
|
||
"def apply_oversampling(X, y):\n",
|
||
"    \"\"\"Resample X and y with RandomOverSampler(random_state=42).\n",
|
||
"\n",
|
||
"    Returns the resampled (X, y) pair produced by fit_resample().\n",
|
||
"    \"\"\"\n",
|
||
"    X_over, y_over = RandomOverSampler(random_state=42).fit_resample(X, y)\n",
|
||
"    return X_over, y_over\n",
|
||
"\n",
|
||
"def apply_undersampling(X, y):\n",
|
||
"    \"\"\"Resample X and y with RandomUnderSampler(random_state=42).\n",
|
||
"\n",
|
||
"    Returns the resampled (X, y) pair produced by fit_resample().\n",
|
||
"    \"\"\"\n",
|
||
"    X_under, y_under = RandomUnderSampler(random_state=42).fit_resample(X, y)\n",
|
||
"    return X_under, y_under\n",
|
||
"\n",
|
||
"def split_stratified_into_train_val_test(\n",
|
||
"    df_input,\n",
|
||
"    stratify_colname=\"y\",\n",
|
||
"    frac_train=0.6,\n",
|
||
"    frac_val=0.15,\n",
|
||
"    frac_test=0.25,\n",
|
||
"    random_state=None,\n",
|
||
"):\n",
|
||
"    \"\"\"\n",
|
||
"    Split a Pandas dataframe into stratified train, val and test subsets.\n",
|
||
"\n",
|
||
"    The split is performed by running train_test_split() twice: first to\n",
|
||
"    separate the training set, then to divide the remainder into the\n",
|
||
"    validation and test sets. Both calls stratify on `stratify_colname`.\n",
|
||
"\n",
|
||
"    Parameters\n",
|
||
"    ----------\n",
|
||
"    df_input : Pandas dataframe\n",
|
||
"        Input dataframe to be split.\n",
|
||
"    stratify_colname : str\n",
|
||
"        The name of the column that will be used for stratification. Usually\n",
|
||
"        this column would be for the label.\n",
|
||
"    frac_train : float\n",
|
||
"    frac_val : float\n",
|
||
"    frac_test : float\n",
|
||
"        The ratios with which the dataframe will be split into train, val, and\n",
|
||
"        test data. The values should be expressed as float fractions and should\n",
|
||
"        sum to 1.0 (checked with a small absolute tolerance).\n",
|
||
"    random_state : int, None, or RandomStateInstance\n",
|
||
"        Value to be passed to both train_test_split() calls.\n",
|
||
"\n",
|
||
"    Returns\n",
|
||
"    -------\n",
|
||
"    df_train, df_val, df_test :\n",
|
||
"        Dataframes containing the three splits (all columns of df_input).\n",
|
||
"\n",
|
||
"    Raises\n",
|
||
"    ------\n",
|
||
"    ValueError\n",
|
||
"        If the fractions do not sum to 1.0, or if `stratify_colname` is not\n",
|
||
"        a column of `df_input`.\n",
|
||
"    \"\"\"\n",
|
||
"\n",
|
||
"    # Compare with a tolerance: an exact `!= 1.0` test rejects valid inputs\n",
|
||
"    # such as 0.7 + 0.2 + 0.1, which sums to 0.9999999999999999 in floats.\n",
|
||
"    frac_total = frac_train + frac_val + frac_test\n",
|
||
"    if not np.isclose(frac_total, 1.0, rtol=0.0, atol=1e-9):\n",
|
||
"        raise ValueError(\n",
|
||
"            \"fractions %f, %f, %f do not add up to 1.0\"\n",
|
||
"            % (frac_train, frac_val, frac_test)\n",
|
||
"        )\n",
|
||
"\n",
|
||
"    if stratify_colname not in df_input.columns:\n",
|
||
"        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
|
||
"\n",
|
||
"    X = df_input  # Contains all columns.\n",
|
||
"    y = df_input[\n",
|
||
"        [stratify_colname]\n",
|
||
"    ]  # Dataframe of just the column on which to stratify.\n",
|
||
"\n",
|
||
"    # Split original dataframe into train and temp dataframes.\n",
|
||
"    df_train, df_temp, _, y_temp = train_test_split(\n",
|
||
"        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
|
||
"    )\n",
|
||
"\n",
|
||
"    # Split the temp dataframe into val and test dataframes.\n",
|
||
"    # test_size is relative to the temp dataframe, not to df_input.\n",
|
||
"    relative_frac_test = frac_test / (frac_val + frac_test)\n",
|
||
"    df_val, df_test, _, _ = train_test_split(\n",
|
||
"        df_temp,\n",
|
||
"        y_temp,\n",
|
||
"        stratify=y_temp,\n",
|
||
"        test_size=relative_frac_test,\n",
|
||
"        random_state=random_state,\n",
|
||
"    )\n",
|
||
"\n",
|
||
"    # Sanity check: every input row ended up in exactly one split.\n",
|
||
"    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
||
"\n",
|
||
"    return df_train, df_val, df_test\n",
|
||
"\n",
|
||
"\n",
|
||
"# Read only the first 100,000 rows of the CSV, then print the column/dtype/non-null summary.\n",
|
||
"df = pd.read_csv(\"../data/age.csv\", nrows=100000)\n",
|
||
"df.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Такую информацию могут использовать компании, связанные с историей и культурой, с разработкой игр (GameDev) и с кинопроизводством. Реальные имена могут сделать тот же фильм более исторически достоверным. "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"В качестве бизнес-целей выделим следующие два варианта:\n",
|
||
" 1) GameDev. Создание игры про конкретного персонажа, живущего в конкретном временном промежутке в конкретной стране. \n",
|
||
" 2) Исследование зависимости длительности жизни от страны проживания.\n",
|
||
" "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Поскольку данные неполные, пропуски необходимо заполнить стандартными значениями:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Id 0\n",
|
||
"Name 0\n",
|
||
"Short description 77\n",
|
||
"Gender 1985\n",
|
||
"Country 5467\n",
|
||
"Occupation 2701\n",
|
||
"Birth year 0\n",
|
||
"Death year 1\n",
|
||
"Manner of death 85179\n",
|
||
"Age of death 1\n",
|
||
"dtype: int64\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Number of missing values in each column.\n",
|
||
"print(df.isnull().sum())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"Index: 99922 entries, 0 to 99999\n",
|
||
"Data columns (total 10 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 Id 99922 non-null object \n",
|
||
" 1 Name 99922 non-null object \n",
|
||
" 2 Short description 99922 non-null object \n",
|
||
" 3 Gender 99922 non-null object \n",
|
||
" 4 Country 99922 non-null object \n",
|
||
" 5 Occupation 99922 non-null object \n",
|
||
" 6 Birth year 99922 non-null int64 \n",
|
||
" 7 Death year 99922 non-null float64\n",
|
||
" 8 Manner of death 99922 non-null object \n",
|
||
" 9 Age of death 99922 non-null float64\n",
|
||
"dtypes: float64(2), int64(1), object(7)\n",
|
||
"memory usage: 8.4+ MB\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Id</th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Short description</th>\n",
|
||
" <th>Gender</th>\n",
|
||
" <th>Country</th>\n",
|
||
" <th>Occupation</th>\n",
|
||
" <th>Birth year</th>\n",
|
||
" <th>Death year</th>\n",
|
||
" <th>Manner of death</th>\n",
|
||
" <th>Age of death</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>99995</th>\n",
|
||
" <td>Q729652</td>\n",
|
||
" <td>Jacques-Joseph Moreau</td>\n",
|
||
" <td>French psychiatrist</td>\n",
|
||
" <td>Male</td>\n",
|
||
" <td>France</td>\n",
|
||
" <td>Psychiatrist; psychologist</td>\n",
|
||
" <td>1804</td>\n",
|
||
" <td>1884.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>99996</th>\n",
|
||
" <td>Q729661</td>\n",
|
||
" <td>Jerome Wiesner</td>\n",
|
||
" <td>American academic engineer</td>\n",
|
||
" <td>Male</td>\n",
|
||
" <td>United States of America</td>\n",
|
||
" <td>Researcher</td>\n",
|
||
" <td>1915</td>\n",
|
||
" <td>1994.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>79.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>99997</th>\n",
|
||
" <td>Q729662</td>\n",
|
||
" <td>Westmoreland Davis</td>\n",
|
||
" <td>American politician (1859-1942)</td>\n",
|
||
" <td>Male</td>\n",
|
||
" <td>United States of America</td>\n",
|
||
" <td>Politician</td>\n",
|
||
" <td>1859</td>\n",
|
||
" <td>1942.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>83.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>99998</th>\n",
|
||
" <td>Q729674</td>\n",
|
||
" <td>John Needham</td>\n",
|
||
" <td>English biologist and Roman Catholic priest</td>\n",
|
||
" <td>Male</td>\n",
|
||
" <td>England</td>\n",
|
||
" <td>Religious figure</td>\n",
|
||
" <td>1713</td>\n",
|
||
" <td>1810.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>97.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>99999</th>\n",
|
||
" <td>Q729679</td>\n",
|
||
" <td>Francis Bourne</td>\n",
|
||
" <td>Catholic cardinal</td>\n",
|
||
" <td>Male</td>\n",
|
||
" <td>United Kingdom</td>\n",
|
||
" <td>Religious figure</td>\n",
|
||
" <td>1861</td>\n",
|
||
" <td>1934.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>73.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Id Name \\\n",
|
||
"99995 Q729652 Jacques-Joseph Moreau \n",
|
||
"99996 Q729661 Jerome Wiesner \n",
|
||
"99997 Q729662 Westmoreland Davis \n",
|
||
"99998 Q729674 John Needham \n",
|
||
"99999 Q729679 Francis Bourne \n",
|
||
"\n",
|
||
" Short description Gender \\\n",
|
||
"99995 French psychiatrist Male \n",
|
||
"99996 American academic engineer Male \n",
|
||
"99997 American politician (1859-1942) Male \n",
|
||
"99998 English biologist and Roman Catholic priest Male \n",
|
||
"99999 Catholic cardinal Male \n",
|
||
"\n",
|
||
" Country Occupation Birth year \\\n",
|
||
"99995 France Psychiatrist; psychologist 1804 \n",
|
||
"99996 United States of America Researcher 1915 \n",
|
||
"99997 United States of America Politician 1859 \n",
|
||
"99998 England Religious figure 1713 \n",
|
||
"99999 United Kingdom Religious figure 1861 \n",
|
||
"\n",
|
||
" Death year Manner of death Age of death \n",
|
||
"99995 1884.0 NaN 80.0 \n",
|
||
"99996 1994.0 NaN 79.0 \n",
|
||
"99997 1942.0 NaN 83.0 \n",
|
||
"99998 1810.0 NaN 97.0 \n",
|
||
"99999 1934.0 NaN 73.0 "
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Fill gaps in the categorical columns with the literal string \"NaN\" (a placeholder,\n",
|
||
"# not a real missing value), then drop the rows that still contain nulls.\n",
|
||
"df = df.fillna(\n",
|
||
"    {\"Gender\": \"NaN\", \"Country\": \"NaN\", \"Occupation\": \"NaN\", \"Manner of death\": \"NaN\"}\n",
|
||
").dropna()\n",
|
||
"df.info()\n",
|
||
"df.tail()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Пропуски в категориальных столбцах заполнены; удалены только те строки, в которых не было года смерти или краткого описания."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<Axes: ylabel='Frequency'>"
|
||
]
|
||
},
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAlUAAAGdCAYAAAA7VYb2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA5oUlEQVR4nO3de1xVVeL///cRPCDGxRscGBHxnnezhmFSy9EBLx+zdD5TanljdGpwKjEzv6mp9UjDoqwspykvPbJ0/IxZo+WIl7QSNS+EWg9KU6kR1E9eTmgCwv790Y/98QgiHjZwDuf1fDz2Q/dea++99lnCebv2OvvYDMMwBAAAgCqpV9sNAAAAqAsIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAf/aboA3KCkp0YkTJxQcHCybzVbbzQEAAJVgGIZ++uknRUVFqV696h9HIlRVwokTJxQdHV3bzQAAAG74/vvv1bx582o/D6GqEoKDgyX90ikhISG13BoAAFAZTqdT0dHR5vt4dSNUVULpLb+QkBBCFQAAXqampu4wUR0AAMAChCoAAAALEKoAAAAswJwqixiGocuXL6u4uLi2mwKL1K9fX35+frXdDACAlyBUWaCwsFC5ubm6ePFibTcFFrLZbGrevLluuumm2m4KAMALEKqqqKSkREePHpWfn5+ioqJkt9t5QGgdYBiGTp8+rR9++EFt27ZlxAoAcF2EqioqLCxUSUmJoqOjFRQUVNvNgYWaNWumY8eOqaioiFAFALguJqpbpCYef4+axYgjAOBGkAQAAAAsQKhChY4dOyabzabMzEy39rfZbFq7dq2lbQIAwBMxp6oatXxifY2e79j8wTdUf+zYsVq+fLm53rhxY912221KTU1V165dJUnR0dHKzc1V06ZNKzzW7NmztXbtWrfDFwAA3o6RKh83YMAA5ebmKjc3V5s3b5a/v7/+67/+yyz38/OTw+GQv3/5+bv0+VzerrCwsLabAADwcoQqHxcQECCHwyGHw6Hu3bvriSee0Pfff6/Tp09LKnv775NPPpHNZtPHH3+snj17KiAgQO+8847mzJmjL7/8UjabTTabTcuWLTPP8b//+7+65557FBQUpLZt2+rDDz+8Znvmzp2rzp07l9nevXt3zZw501x/8803dfPNNyswMFAdOnTQa6+95lJ/2rRpateunYKCgtSqVSvNnDlTRUVFZvns2bPVvXt3vfnmm4qNjVVgYKA7Lx8AACZu/8GUn5+vd955R23atFGTJk0qrPvEE0/o+eefV6tWrRQYGKgpU6Zow4YN2rRpkyQpNDTUrDtnzhylpqZqwYIFeuWVVzRq1CgdP35cjRs3LnPc8ePHa86cOfriiy902223SZL279+vrKwsrVmzRpK0YsUKzZo1S6+++qp69Oih/fv3a8KECWrYsKHGjBkjSQoODtayZcsUFRWlAwcOaMKECQoODtbjjz9unuvw4cP65z//qTVr1vDIBABAlRGqfNy6devMJ4ZfuHBBkZGRWrdu3XUfETF37lz9/ve/N9dvuukm+fv7y+FwlKk7duxYjRgxQpL07LPP6uWXX9bu3bs1YMCAMnWbN2+uxMRELV261AxVS5cu1R133KFWrVpJkp566im98MILGjZsmCQpNjZWX331lf72t7+ZoWrGjBnmMVu2bKnHHntMK1eudAlVhYWFevvtt9WsWbPrv1AAgEpp+cT6G57jW1dw+8/H9e3bV5mZmcrMzNTu3buVmJiogQMH6vjx4xXud+utt1b6HKWT3iWpYcOGCgkJ0alTp65Zf8KECXrvvfd06dIlFRYW6t1339X48eMl/RL8jhw5oqSkJN10003m8swzz+jIkSPmMVatWqXbb79dDodDN910k2bMmKGcnByX88TExBCoAACWYaTKxzVs2FBt2rQx1998802Fhobq73//u5555pkK96us+vXru6zbbDaVlJ
Rcs/6QIUMUEBCg999/X3a7XUVFRfrDH/4g6ZdblJL097//XXFxcS77ld7Cy8jI0KhRozRnzhwlJiYqNDRUK1eu1AsvvOD2NQAAcD2EKriw2WyqV6+efv755xvaz263q7i42JI2+Pv7a8yYMVq6dKnsdrvuu+8+NWjQQJIUERGhqKgofffddxo1alS5++/YsUMxMTF68sknzW3XG3kDAKCqCFU+rqCgQHl5eZKks2fP6tVXX1V+fr6GDBlyQ8dp2bKljh49qszMTDVv3lzBwcEKCAhwu11/+tOfdPPNN0uSPv/8c5eyOXPm6OGHH1ZoaKgGDBiggoIC7dmzR2fPnlVKSoratm2rnJwcrVy5UrfddpvWr1+v999/3+22AABQGcyp8nEbNmxQZGSkIiMjFRcXpy+++EKrV6/WnXfeeUPHGT58uAYMGKC+ffuqWbNmeu+996rUrrZt2+q3v/2tOnToUOY235/+9Ce9+eabWrp0qbp06aI77rhDy5YtU2xsrCTprrvu0uTJkzVp0iR1795dO3bscHkcAwAA1cFmGIZR243wdE6nU6GhoTp//rxCQkJcyi5duqSjR4/yrCOLGYahtm3b6i9/+YtSUlJqpQ30LQDcOE/69F9F79/VoVZHqrZv364hQ4YoKiqq3O+IK32Q5NXLggULzDotW7YsUz5//nyX42RlZal3794KDAxUdHS0UlNTa+Ly4KbTp0/r1VdfVV5ensaNG1fbzQEAoFJqdU7VhQsX1K1bN40fP9585tCVcnNzXdY//vhjJSUlafjw4S7b586dqwkTJpjrwcHB5t+dTqcSEhLUv39/LV68WAcOHND48eMVFhamiRMnWnxFsEJ4eLiaNm2qN954Q40aNart5gAAUCm1GqoGDhyogQMHXrP86gdJfvDBB+rbt6/5EMhSwcHB5T50Uvrl6duFhYVasmSJ7Ha7OnXqpMzMTKWlpRGqPBR3pAEA3shrJqqfPHlS69evV1JSUpmy+fPnq0mTJurRo4cWLFjg8gW/GRkZ6tOnj+x2u7ktMTFR2dnZOnv2bLnnKigokNPpdFkAAAAq4jWPVFi+fLmCg4PL3CZ8+OGHdcstt6hx48basWOHpk+frtzcXKWlpUmS8vLyzE+FlYqIiDDLyru9NG/ePM2ZM6eargQAANRFXhOqlixZolGjRpX5FNaVnwzr2rWr7Ha7/vznP2vevHluPydp+vTpLsd1Op2Kjo6ucB9uWdU99CkA4EZ4Raj69NNPlZ2drVWrVl23blxcnC5fvqxjx46pffv2cjgcOnnypEud0vVrzcMKCAiodCAr/QqWixcvmk/9Rt1QWFgo6f++/gYAgIp4Rah666231LNnT3Xr1u26dTMzM1WvXj2Fh4dLkuLj4/Xkk0+qqKjIDEDp6elq3769JZ8s8/PzU1hYmPkFwUFBQbLZbFU+LmpXSUmJTp8+raCgIPn7e8WPCQCgltXqu0V+fr4OHz5srpd+zUnjxo3VokULSb/celu9enWZL8OVfpmEvmvXLvXt21fBwcHKyMjQ5MmTdf/995uBaeTIkZozZ46SkpI0bdo0HTx4UAsXLtSLL75o2XWUjniVBivUDfXq1VOLFi0IyQCASqnVULVnzx717dvXXC+dxzRmzBgtW7ZMkrRy5UoZhqERI0aU2T8gIEArV67U7NmzVVBQoNjYWE2ePNllPlRoaKg2btyo5ORk9ezZU02bNtWsWbMsfZyCzWZTZGSkwsPDVVRUZNlxUbvsdrvq1fOaD8gCAGoZX1NTCTX9mHsAALwVX1MDAACAKiFUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQ
AAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWKBWQ9X27ds1ZMgQRUVFyWazae3atS7lY8eOlc1mc1kGDBjgUufMmTMaNWqUQkJCFBYWpqSkJOXn57vUycrKUu/evRUYGKjo6GilpqZW96UBAAAfU6uh6sKFC+rWrZsWLVp0zToDBgxQbm6uubz33nsu5aNGjdKhQ4eUnp6udevWafv27Zo4caJZ7nQ6lZCQoJiYGO3du1cLFizQ7Nmz9cYbb1TbdQEAAN/jX5snHzhwoAYOHFhhnYCAADkcjnLLvv76a23YsEFffPGFbr31VknSK6+8okGDBun5559XVFSUVqxYocLCQi1ZskR2u12dOnVSZmam0tLSXMIXAABAVXj8nKpPPvlE4eHhat++vR566CH9+OOPZllGRobCwsLMQCVJ/fv3V7169bRr1y6zTp8+fWS32806iYmJys7O1tmzZ8s9Z0FBgZxOp8sCAABQEY8OVQMGDNDbb7+tzZs367nnntO2bds0cOBAFRcXS5Ly8vIUHh7uso+/v78aN26svLw8s05ERIRLndL10jpXmzdvnkJDQ80lOjra6ksDAAB1TK3e/rue++67z/x7ly5d1LVrV7Vu3VqffPKJ+vXrV23nnT59ulJSUsx1p9NJsAIAABXy6JGqq7Vq1UpNmzbV4cOHJUkOh0OnTp1yqXP58mWdOXPGnIflcDh08uRJlzql69eaqxUQEKCQkBCXBQAAoCJeFap++OEH/fjjj4qMjJQkxcfH69y5c9q7d69ZZ8uWLSopKVFcXJxZZ/v27SoqKjLrpKenq3379mrUqFHNXgAAAKizajVU5efnKzMzU5mZmZKko0ePKjMzUzk5OcrPz9fUqVO1c+dOHTt2TJs3b9bQoUPVpk0bJSYmSpJuvvlmDRgwQBMmTNDu3bv1+eefa9KkSbrvvvsUFRUlSRo5cqTsdruSkpJ06NAhrVq1SgsXLnS5vQcAAFBVtRqq9uzZox49eqhHjx6SpJSUFPXo0UOzZs2Sn5+fsrKydNddd6ldu3ZKSkpSz5499emnnyogIMA8xooVK9ShQwf169dPgwYNUq9evVyeQRUaGqqNGzfq6NGj6tmzp6ZMmaJZs2bxOAUAAGApm2EYRm03wtM5nU6Fhobq/PnzzK8CAKACLZ9Yr2PzB9d2MyTV/Pu3V82pAgAA8FSEKgAAAAsQqgAAACxAqAIAALAAoQoAAMAChCoAAAALEKoAAAAsQKgCAAAuWj6xvrab4JUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWqNVQtX37dg0ZMkRRUVGy2Wxau3atWVZUVKRp06apS5cuatiwoaKiojR69GidOHHC5RgtW7aUzWZzWebPn+9SJysrS71791ZgYKCio6OVmppaE5cHAAB8SK2GqgsXLqhbt25atGhRmbKLFy9q3759mjlzpvbt26c1a9YoOztbd911V5m6c+fOVW5urrn89a9/NcucTqcSEhIUExOjvXv3asGCBZo9e7beeOONar02AADgW/xr8+QDBw7UwIEDyy0LDQ1Venq6y7ZXX31Vv/71r5WTk6MWLVqY24ODg+VwOMo9zooVK1RYWKglS5bIbrerU6dOyszMVFpamiZOnGjdxQAAAJ/mVXOqzp8/L5vNprCwMJft8+fPV5MmTdSjRw8tWLBAly9fNssyMjLUp08f2e12c1tiYqKys7N19uzZmmo6AA
Co42p1pOpGXLp0SdOmTdOIESMUEhJibn/44Yd1yy23qHHjxtqxY4emT5+u3NxcpaWlSZLy8vIUGxvrcqyIiAizrFGjRmXOVVBQoIKCAnPd6XRWxyUBAIA6xCtCVVFRkf74xz/KMAy9/vrrLmUpKSnm37t27Sq73a4///nPmjdvngICAtw637x58zRnzpwqtRkAAPgWj7/9Vxqojh8/rvT0dJdRqvLExcXp8uXLOnbsmCTJ4XDo5MmTLnVK1681D2v69Ok6f/68uXz//fdVvxAAAFCneXSoKg1U3377rTZt2qQmTZpcd5/MzEzVq1dP4eHhkqT4+Hht375dRUVFZp309HS1b9++3Ft/khQQEKCQkBCXBQAAoCK1evsvPz9fhw8fNtePHj2qzMxMNW7cWJGRkfrDH/6gffv2ad26dSouLlZeXp4kqXHjxrLb7crIyNCuXbvUt29fBQcHKyMjQ5MnT9b9999vBqaRI0dqzpw5SkpK0rRp03Tw4EEtXLhQL774Yq1cMwAAqJtqNVTt2bNHffv2NddL50eNGTNGs2fP1ocffihJ6t69u8t+W7du1Z133qmAgACtXLlSs2fPVkFBgWJjYzV58mSXeVahoaHauHGjkpOT1bNnTzVt2lSzZs3icQoAAMBStRqq7rzzThmGcc3yisok6ZZbbtHOnTuve56uXbvq008/veH2AQAAVJZHz6kCAADwFoQqAAAACxCqAAAALECoAgAAsAChCgAAwAKEKgAAAAsQqgAAACxAqAIAALAAoQoAAMAChCoAAAALEKoAAAAsQKgCAACwAKEKAADAAoQqAAAACxCqAAAALECoAgAAsAChCgAAwAKEKgAAAAsQqgAAACxAqAIAALCAW6Hqu+++s7odAAAAXs2tUNWmTRv17dtX77zzji5dumR1mwAAALyOW6Fq37596tq1q1JSUuRwOPTnP/9Zu3fvtrptAAAAXsOtUNW9e3ctXLhQJ06c0JIlS5Sbm6tevXqpc+fOSktL0+nTp61uJwAAgEer0kR1f39/DRs2TKtXr9Zzzz2nw4cP67HHHlN0dLRGjx6t3Nxcq9oJAADg0aoUqvbs2aO//OUvioyMVFpamh577DEdOXJE6enpOnHihIYOHWpVOwEAADyavzs7paWlaenSpcrOztagQYP09ttva9CgQapX75eMFhsbq2XLlqlly5ZWthUAAMBjuRWqXn/9dY0fP15jx45VZGRkuXXCw8P11ltvValxAAAA3sKtUPXtt99et47dbteYMWPcOTwAAIDXcWtO1dKlS7V69eoy21evXq3ly5dXuVEAAADexq1QNW/ePDVt2rTM9vDwcD377LNVbhQAAIC3cStU5eTkKDY2tsz2mJgY5eTkVLlRAAAA3satUBUeHq6srKwy27/88ks1adKkyo0CAADwNm6FqhEjRujhhx/W1q1bVVxcrOLiYm3ZskWPPPKI7rvvPqvbCAAA4PHc+vTf008/rWPHjqlfv37y9//lECUlJRo9ejRzqgAAgE9yK1TZ7XatWrVKTz/9tL788ks1aNBAXbp0UUxMjNXtAwAA8ApuhapS7dq1U7t27axqCwAAgNdyK1QVFxdr2bJl2rx5s06dOqWSkhKX8i1btljSOAAAAG/h1kT1Rx55RI888oiKi4vVuXNndevWzWWprO3bt2vIkCGKioqSzWbT2rVrXcoNw9CsWbMUGRmpBg0aqH///mWe5n7mzBmNGjVKISEhCgsLU1JSkvLz813qZGVlqXfv3goMDFR0dLRSU1PduWwAAIBrcmukauXKlfrHP/6hQYMGVenkFy5cULdu3TR+/HgNGzasTHlqaqpefvllLV++XLGxsZo5c6YSExP11VdfKTAwUJI0atQo5ebmKj09XUVFRRo3bpwmTpyod999V5LkdDqVkJCg/v37a/HixTpw4IDGjx+vsLAwTZw4sUrtBwAAKOX2RPU2bdpU+eQDBw7UwIEDyy0zDEMvvfSSZsyYoaFDh0
qS3n77bUVERGjt2rW677779PXXX2vDhg364osvdOutt0qSXnnlFQ0aNEjPP/+8oqKitGLFChUWFmrJkiWy2+3q1KmTMjMzlZaWRqgCAACWcev235QpU7Rw4UIZhmF1e0xHjx5VXl6e+vfvb24LDQ1VXFycMjIyJEkZGRkKCwszA5Uk9e/fX/Xq1dOuXbvMOn369JHdbjfrJCYmKjs7W2fPni333AUFBXI6nS4LAABARdwaqfrss8+0detWffzxx+rUqZPq16/vUr5mzZoqNywvL0+SFBER4bI9IiLCLMvLy1N4eLhLub+/vxo3buxS5+qv1Ck9Zl5enho1alTm3PPmzdOcOXOqfA0AAMB3uBWqwsLCdM8991jdFo8xffp0paSkmOtOp1PR0dG12CIAAODp3ApVS5cutbodZTgcDknSyZMnFRkZaW4/efKkunfvbtY5deqUy36XL1/WmTNnzP0dDodOnjzpUqd0vbTO1QICAhQQEGDJdQAAAN/g1pwq6ZfwsmnTJv3tb3/TTz/9JEk6ceJEmccZuCs2NlYOh0ObN282tzmdTu3atUvx8fGSpPj4eJ07d0579+4162zZskUlJSWKi4sz62zfvl1FRUVmnfT0dLVv377cW38AAADucCtUHT9+XF26dNHQoUOVnJys06dPS5Kee+45PfbYY5U+Tn5+vjIzM5WZmSnpl8npmZmZysnJkc1m06OPPqpnnnlGH374oQ4cOKDRo0crKipKd999tyTp5ptv1oABAzRhwgTt3r1bn3/+uSZNmqT77rtPUVFRkqSRI0fKbrcrKSlJhw4d0qpVq7Rw4UKX23sAAABV5dbtv0ceeUS33nqrvvzySzVp0sTcfs8992jChAmVPs6ePXvUt29fc7006IwZM0bLli3T448/rgsXLmjixIk6d+6cevXqpQ0bNpjPqJKkFStWaNKkSerXr5/q1aun4cOH6+WXXzbLQ0NDtXHjRiUnJ6tnz55q2rSpZs2axeMUAACApWyGG89FaNKkiXbs2KH27dsrODhYX375pVq1aqVjx46pY8eOunjxYnW0tdY4nU6Fhobq/PnzCgkJqe3mAABQrVo+sV7H5g+u8X2tVtPv327d/ispKVFxcXGZ7T/88IOCg4Or3CgAAABv41aoSkhI0EsvvWSu22w25efn66mnnqryV9cAAAB4I7fmVL3wwgtKTExUx44ddenSJY0cOVLffvutmjZtqvfee8/qNgIAAC/Q8on1td2EWuVWqGrevLm+/PJLrVy5UllZWcrPz1dSUpJGjRqlBg0aWN1GAAAAj+dWqJJ++TqY+++/38q2AAAAeC23QtXbb79dYfno0aPdagwAAIC3cvs5VVcqKirSxYsXZbfbFRQURKgCAMBHeNIjFGqbW5/+O3v2rMuSn5+v7Oxs9erVi4nqAADAJ7n93X9Xa9u2rebPn19mFAsAAMAXWBaqpF8mr584ccLKQwIAAA/n649SKOXWnKoPP/zQZd0wDOXm5urVV1/V7bffbknDAAAAvIlboeruu+92WbfZbGrWrJl+97vf6YUXXrCiXQAAAF7FrVBVUlJidTsAAAC8mqVzqgAAgG9iXpWbI1UpKSmVrpuWlubOKQAAALyKW6Fq//792r9/v4qKitS+fXtJ0jfffCM/Pz/dcsstZj2bzWZNKwEAADycW6FqyJAhCg4O1vLly9WoUSNJvzwQdNy4cerdu7emTJliaSMBAAA8nVtzql544QXNmzfPDFSS1KhRIz3zzDN8+g8AAPgkt0KV0+nU6dOny2w/ffq0fvrppyo3CgAAwNu4FaruuecejRs3TmvWrNEPP/ygH374Qf/85z+VlJSkYcOGWd1GAADgASr7CT9f/SSgW3OqFi9erMcee0wjR45UUVHRLwfy91dSUpIWLFhgaQMBAAC8gVuhKigoSK+99poWLFigI0eOSJJat26thg0bWto4AAAAb1Glh3/m5uYqNzdXbdu2VcOGDWUYhlXtAgAA8C
puhaoff/xR/fr1U7t27TRo0CDl5uZKkpKSknicAgAAHsBX5zXVJrdC1eTJk1W/fn3l5OQoKCjI3H7vvfdqw4YNljUOAADUDkLZjXNrTtXGjRv173//W82bN3fZ3rZtWx0/ftyShgEAAHgTt0aqLly44DJCVerMmTMKCAiocqMAAAC8jVuhqnfv3nr77bfNdZvNppKSEqWmpqpv376WNQ4AANSelk+sNxdcn1u3/1JTU9WvXz/t2bNHhYWFevzxx3Xo0CGdOXNGn3/+udVtBAAAHqTlE+t1bP7g2m6Gx3FrpKpz58765ptv1KtXLw0dOlQXLlzQsGHDtH//frVu3drqNgIAAA/BqNW13fBIVVFRkQYMGKDFixfrySefrI42AQAAeJ0bHqmqX7++srKyqqMtAADAQzFCdX1u3f67//779dZbb1ndFgAAAK/l1kT1y5cva8mSJdq0aZN69uxZ5jv/0tLSLGkcAACAt7ihUPXdd9+pZcuWOnjwoG655RZJ0jfffONSx2azWdc6AAAAL3FDoapt27bKzc3V1q1bJf3ytTQvv/yyIiIiqqVxAAAA3uKG5lQZhuGy/vHHH+vChQuWNggAAMAbuTVRvdTVIQsAAMBX3VCostlsZeZMVfccqpYtW5rnvXJJTk6WJN15551lyh588EGXY+Tk5Gjw4MEKCgpSeHi4pk6dqsuXL1druwEAgG+5oTlVhmFo7Nix5pcmX7p0SQ8++GCZT/+tWbPGsgZ+8cUXKi4uNtcPHjyo3//+9/rv//5vc9uECRM0d+5cc/3KL3suLi7W4MGD5XA4tGPHDuXm5mr06NGqX7++nn32WcvaCQAAfNsNhaoxY8a4rN9///2WNqY8zZo1c1mfP3++WrdurTvuuMPcFhQUJIfDUe7+Gzdu1FdffaVNmzYpIiJC3bt319NPP61p06Zp9uzZstvt1dp+AADgG24oVC1durS62lEphYWFeuedd5SSkuJy23HFihV655135HA4NGTIEM2cOdMcrcrIyFCXLl1cPqGYmJiohx56SIcOHVKPHj3KnKegoEAFBQXmutPprMarAgAAdYFbD/+sLWvXrtW5c+c0duxYc9vIkSMVExOjqKgoZWVladq0acrOzjZvQebl5ZV55EPpel5eXrnnmTdvnubMmVM9FwEAAOokrwpVb731lgYOHKioqChz28SJE82/d+nSRZGRkerXr5+OHDmi1q1bu3We6dOnKyUlxVx3Op2Kjo52v+EAAKDO85pQdfz4cW3atOm6k+Dj4uIkSYcPH1br1q3lcDi0e/dulzonT56UpGvOwwoICDAn4wMAAFRGlZ5TVZOWLl2q8PBwDR48uMJ6mZmZkqTIyEhJUnx8vA4cOKBTp06ZddLT0xUSEqKOHTtWW3sBAIBv8YqRqpKSEi1dulRjxoyRv///NfnIkSN69913NWjQIDVp0kRZWVmaPHmy+vTpo65du0qSEhIS1LFjRz3wwANKTU1VXl6eZsyYoeTkZEajAACAZbwiVG3atEk5OTkaP368y3a73a5NmzbppZde0oULFxQdHa3hw4drxowZZh0/Pz+tW7dODz30kOLj49WwYUONGTPG5blWAAAAVeUVoSohIaHcr8SJjo7Wtm3brrt/TEyMPvroo+poGgAAgCQvmlMFAADgyQhVAAAAFiBUAQAAU8sn1td2E7wWoQoAAMAChCoAAAALEKoAAAAsQKgCAACW88W5WYQqAAAACxCqAAAALECoAgAAN8wXb+9dD6EKAADAAoQqAAAACxCqAAAALECoAgAAbmFelStCFQAAPurKUERAqjpCFQAAXoog5FkIVQAA+DjCmTUIVQAAQBLhqqoIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAIAP4vEJ1iNUAQAAWIBQBQCAD2PEyjqEKgAAAAsQqgAAQLXwtVEwQhUAAIAFCFUAAAAWIFQBAABYwL+2GwAAAGqOr81zqkmMVA
EAAFiAUAUAAGABQhUAAIAFCFUAAKDa+NIcLkIVAACABTw6VM2ePVs2m81l6dChg1l+6dIlJScnq0mTJrrppps0fPhwnTx50uUYOTk5Gjx4sIKCghQeHq6pU6fq8uXLNX0pAACgjvPoUCVJnTp1Um5urrl89tlnZtnkyZP1r3/9S6tXr9a2bdt04sQJDRs2zCwvLi7W4MGDVVhYqB07dmj58uVatmyZZs2aVRuXAgCAT/KVW4Ae/5wqf39/ORyOMtvPnz+vt956S++++65+97vfSZKWLl2qm2++WTt37tRvfvMbbdy4UV999ZU2bdqkiIgIde/eXU8//bSmTZum2bNny2631/TlAACAOsrjR6q+/fZbRUVFqVWrVho1apRycnIkSXv37lVRUZH69+9v1u3QoYNatGihjIwMSVJGRoa6dOmiiIgIs05iYqKcTqcOHTp0zXMWFBTI6XS6LAAAABXx6FAVFxenZcuWacOGDXr99dd19OhR9e7dWz/99JPy8vJkt9sVFhbmsk9ERITy8vIkSXl5eS6BqrS8tOxa5s2bp9DQUHOJjo629sIAAECd49G3/wYOHGj+vWvXroqLi1NMTIz+8Y9/qEGDBtV23unTpyslJcVcdzqdBCsAAFAhjx6pulpYWJjatWunw4cPy+FwqLCwUOfOnXOpc/LkSXMOlsPhKPNpwNL18uZplQoICFBISIjLAgAAUBGvClX5+fk6cuSIIiMj1bNnT9WvX1+bN282y7Ozs5WTk6P4+HhJUnx8vA4cOKBTp06ZddLT0xUSEqKOHTvWePsBALiar3wyzhd49O2/xx57TEOGDFFMTIxOnDihp556Sn5+fhoxYoRCQ0OVlJSklJQUNW7cWCEhIfrrX/+q+Ph4/eY3v5EkJSQkqGPHjnrggQeUmpqqvLw8zZgxQ8nJyQoICKjlqwMAAHWJR4eqH374QSNGjNCPP/6oZs2aqVevXtq5c6eaNWsmSXrxxRdVr149DR8+XAUFBUpMTNRrr71m7u/n56d169bpoYceUnx8vBo2bKgxY8Zo7ty5tXVJAACgjvLoULVy5coKywMDA7Vo0SItWrTomnViYmL00UcfWd00AAAAF141pwoAAMBTEaoAAPARTIqvXoQqAAAACxCqAAAALECoAgDAB3Drr/oRqgAAACxAqAIAoI7zhFEqT2hDdSNUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAHVIyyfWm48v8IXHGHgSQhUAAKgRdT3kEaoAAAAsQKgCAKAOq+ujQ56EUAUAAGABQhUAAIAFCFUAANQR3OqrXYQqAAAACxCqAABAjanLo2mEKgAAAAsQqgAA8HJ1efTHmxCqAABAjaqrIZBQBQBAHVBXg4o3IVQBAABYgFAFAABgAUIVAABejNt+noNQBQAAYAFCFQAAgAUIVQAAeCFu+3keQhUAAF6g5RPrzSBFoPJMhCoAAAALEKoAAKgiRo4gEaoAAPAqBDjPRagCAACwgEeHqnnz5um2225TcHCwwsPDdffddys7O9ulzp133imbzeayPPjggy51cnJyNHjwYAUFBSk8PFxTp07V5cuXa/JSAAC4IVeOSDE65R08OlRt27ZNycnJ2rlzp9LT01VUVKSEhARduHDBpd6ECROUm5trLqmpqWZZcXGxBg8erMLCQu3YsUPLly/XsmXLNGvWrJq+HAAAXPhyWKqL1+5f2w2oyIYNG1zWly1bpvDwcO3du1d9+vQxtwcFBcnhcJR7jI0bN+qrr77Spk2bFBERoe7du+vpp5/WtGnTNHv2bNnt9mq9BgAAylMXQ4Wv8+iRqqudP39ektS4cWOX7StWrFDTpk3VuXNnTZ8+XRcvXjTLMjIy1KVLF0VERJjbEhMT5XQ6dejQoZppOAAAlXTl86h8jbdft0ePVF2ppKREjz76qG6//XZ17tzZ3D5y5EjFxMQoKipKWVlZmjZtmrKzs7VmzRpJUl5enk
ugkmSu5+XllXuugoICFRQUmOtOp9PqywEAwNTyifU6Nn/wNcvgHbwmVCUnJ+vgwYP67LPPXLZPnDjR/HuXLl0UGRmpfv366ciRI2rdurVb55o3b57mzJlTpfYCAFBVBCrv4hW3/yZNmqR169Zp69atat68eYV14+LiJEmHDx+WJDkcDp08edKlTun6teZhTZ8+XefPnzeX77//vqqXAADwcQSkus+jQ5VhGJo0aZLef/99bdmyRbGxsdfdJzMzU5IUGRkpSYqPj9eBAwd06tQps056erpCQkLUsWPHco8REBCgkJAQlwUAAHdd+Z19hKu6y6Nv/yUnJ+vdd9/VBx98oODgYHMOVGhoqBo0aKAjR47o3Xff1aBBg9SkSRNlZWVp8uTJ6tOnj7p27SpJSkhIUMeOHfXAAw8oNTVVeXl5mjFjhpKTkxUQEFCblwcA8FEEq7rJo0eqXn/9dZ0/f1533nmnIiMjzWXVqlWSJLvdrk2bNikhIUEdOnTQlClTNHz4cP3rX/8yj+Hn56d169bJz89P8fHxuv/++zV69GjNnTu3ti4LAIByEba8m0ePVBmGUWF5dHS0tm3bdt3jxMTE6KOPPrKqWQAAXFdFn+hD3eTRI1UAAHgjRpwqp669ToQqAACqUWWDQ10LGJVVl66bUAUAgKrnzb0uBQZcH6EKAADAAoQqAAAACxCqAACwCLf7fBuhCgAAwAKEKgAALMRole8iVAEAYAHCFAhVAAAAFiBUAfB5jDAAsAKhCgAAwAKEKgAAAAsQqgAAACxAqAIAoAqYk4dShCoAANxAmLJOXXktCVUAgDqjpt+c60oYgDUIVQAAlIPAVLPqwutNqAIA+KxrvZFfub0uvNmjZhCqAAA+qTJhqbxw1fKJ9QQtlItQBQDwORWFIgIT3EWoAgB4laqGHncC1ZWjVMC1EKoAAD6BQITqRqgCAPg8Apfn8OY5a4QqAIBP89Y38LrOG/uFUAUAAGABQhW8jjf+7wWAq+r4OeZ3A2oboQqAR+ONsnp5++tbXvuv9Wypa9WHZ/OmPiNUAaizvOmX8fXUpWupKQQp1DRCFQCfUlufLKrKOT01FHhquyTPbhsqz9v6kVAFS3nbDwBQVTfyb74u/nzUZlhkJAqehlAFwOvU5puor57bXTcafK43RwrwZIQqAECtcic0Xb0PwQuegFAFlINf0N6nJvqsJv9d1ORttfI+LVfRscqbl+buftdr05V/evOTtlF13tD/hCoAddL1fvl62i9nd9tzrbBx5baK/l7ZNlWmfVe341rncTeowXd5y78FQhWASqmJkZPq/sVp1fGt+h/z9YJEReHjWm2o7PEq2tedNgE15UbDfk0iVMFj1NaniDzth7Ku8ZbX90ZGcMrbt6LjubO/O6r756A6roeQBnd46r8RQhVqRV2b/3K983rqL4DqYOX/ImszbFw9InSjo0JXH+fqOpUdKfKlfzvAjfDEnw2fClWLFi1Sy5YtFRgYqLi4OO3evbu2m4Qa5ukjWp4SQjxBVUc7KlPHynk8NzLK5Q395A1tBCTP+hCDz4SqVatWKSUlRU899ZT27dunbt26KTExUadOnXL7mLU1IuHuLYqqnMsXVfcPaUXHvlbZjW6/sry6AqW7o0BX/r2i9av3uV4IutbIUE3y5Z8bwJf5TKhKS0vThAkTNG7cOHXs2FGLFy9WUFCQlixZUqXjVvV/tdV9nqv3vd6bo5XXcyPbrle3ojBh1YRhK/cr739OlQ0FVeXuv63KhPUr67hzHRUFnsqe91rHqmwbANRdtf07wGYYhlGrLagBhYWFCgoK0v/8z//o7rvvNrePGTNG586d0wcffOBSv6CgQAUFBeb6+fPn1aJFC33//fcKCQlR56f+7VL/4JxEdX7q3+X+KcmsX7p+5bYrj1Fe2ZXHuPp4V567PNcqu3p7efUqs6289txIOyrjRvetTP2qtMdq7vTfjR77yuNYce2e9PoBQHlK35+cTqeio6N17t
w5hYaGVv+JDR/wn//8x5Bk7Nixw2X71KlTjV//+tdl6j/11FOGJBYWFhYWFpY6sBw5cqRG8oa/UMb06dOVkpJirp87d04xMTHKycmpmaSLCpX+z6N05BC1h77wHPSF56AvPEfpnabGjRvXyPl8IlQ1bdpUfn5+OnnypMv2kydPyuFwlKkfEBCggICAMttDQ0P5AfEgISEh9IeHoC88B33hOegLz1GvXs1MIfeJiep2u109e/bU5s2bzW0lJSXavHmz4uPja7FlAACgrvCJkSpJSklJ0ZgxY3Trrbfq17/+tV566SVduHBB48aNq+2mAQCAOsBnQtW9996r06dPa9asWcrLy1P37t21YcMGRUREXHffgIAAPfXUU+XeEkTNoz88B33hOegLz0FfeI6a7gufeKQCAABAdfOJOVUAAADVjVAFAABgAUIVAACABQhVAAAAFvCZULV9+3YNGTJEUVFRstlsWrt2rUu5YRiaNWuWIiMj1aBBA/Xv31/ffvutS50zZ85o1KhRCgkJUVhYmJKSkpSfn+9SJysrS71791ZgYKCio6OVmppa3ZfmlSrqj6KiIk2bNk1dunRRw4YNFRUVpdGjR+vEiRMux6A/rHG9n40rPfjgg7LZbHrppZdcttMX1qhMX3z99de66667FBoaqoYNG+q2225TTk6OWX7p0iUlJyerSZMmuummmzR8+PAyDz7OycnR4MGDFRQUpPDwcE2dOlWXL1+u7svzKtfri/z8fE2aNEnNmzdXgwYN1LFjRy1evNilDn1hjXnz5um2225TcHCwwsPDdffddys7O9uljlWv9SeffKJbbrlFAQEBatOmjZYtW3ZDbfWZUHXhwgV169ZNixYtKrc8NTVVL7/8shYvXqxdu3apYcOGSkxM1KVLl8w6o0aN0qFDh5Senq5169Zp+/btmjhxolnudDqVkJCgmJgY7d27VwsWLNDs2bP1xhtvVPv1eZuK+uPixYvat2+fZs6cqX379mnNmjXKzs7WXXfd5VKP/rDG9X42Sr3//vvauXOnoqKiypTRF9a4Xl8cOXJEvXr1UocOHfTJJ58oKytLM2fOVGBgoFln8uTJ+te//qXVq1dr27ZtOnHihIYNG2aWFxcXa/DgwSosLNSOHTu0fPlyLVu2TLNmzar26/Mm1+uLlJQUbdiwQe+8846+/vprPfroo5o0aZI+/PBDsw59YY1t27YpOTlZO3fuVHp6uoqKipSQkKALFy6Ydax4rY8eParBgwerb9++yszM1KOPPqo//elP+ve/b+AL5GvkGwY9jCTj/fffN9dLSkoMh8NhLFiwwNx27tw5IyAgwHjvvfcMwzCMr776ypBkfPHFF2adjz/+2LDZbMZ//vMfwzAM47XXXjMaNWpkFBQUmHWmTZtmtG/fvpqvyLtd3R/l2b17tyHJOH78uGEY9Ed1uVZf/PDDD8avfvUr4+DBg0ZMTIzx4osvmmX0RfUory/uvfde4/7777/mPufOnTPq169vrF692tz29ddfG5KMjIwMwzAM46OPPjLq1atn5OXlmXVef/11IyQkxKV/8H/K64tOnToZc+fOddl2yy23GE8++aRhGPRFdTp16pQhydi2bZthGNa91o8//rjRqVMnl3Pde++9RmJiYqXb5jMjVRU5evSo8vLy1L9/f3NbaGio4uLilJGRIUnKyMhQWFiYbr31VrNO//79Va9ePe3atcus06dPH9ntdrNOYmKisrOzdfbs2Rq6mrrp/PnzstlsCgsLk0R/1KSSkhI98MADmjp1qjp16lSmnL6oGSUlJVq/fr3atWunxMREhYeHKy4uzuW21N69e1VUVOTyu6xDhw5q0aKFy++yLl26uDz4ODExUU6nU4cOHaqx6/F2v/3tb/Xhhx/qP//5jwzD0NatW/XNN98oISFBEn1Rnc6fPy9J5pckW/VaZ2RkuByjtE7pMSqDUCUpLy9Pkso8XT0iIsIsy8vLU3h4uEu5v7+/Gjdu7FKnvG
NceQ7cuEuXLmnatGkaMWKE+eWk9EfNee655+Tv76+HH3643HL6omacOnVK+fn5mj9/vgYMGKCNGzfqnnvu0bBhw7Rt2zZJv7yWdrvd/M9Hqat/l9EXVffKK6+oY8eOat68uex2uwYMGKBFixapT58+kuiL6lJSUqJHH31Ut99+uzp37izJutf6WnWcTqd+/vnnSrXPZ76mBt6pqKhIf/zjH2UYhl5//fXabo7P2bt3rxYuXKh9+/bJZrPVdnN8WklJiSRp6NChmjx5siSpe/fu2rFjhxYvXqw77rijNpvnc1555RXt3LlTH374oWJiYrR9+3YlJycrKiqqzGgHrJOcnKyDBw/qs88+q+2mlIuRKkkOh0OSynxS4OTJk2aZw+HQqVOnXMovX76sM2fOuNQp7xhXngOVVxqojh8/rvT0dHOUSqI/asqnn36qU6dOqUWLFvL395e/v7+OHz+uKVOmqGXLlpLoi5rStGlT+fv7q2PHji7bb775ZvPTfw6HQ4WFhTp37pxLnat/l9EXVfPzzz/r//2//6e0tDQNGTJEXbt21aRJk3Tvvffq+eefl0RfVIdJkyZp3bp12rp1q5o3b25ut+q1vladkJAQNWjQoFJtJFRJio2NlcPh0ObNm81tTqdTu3btUnx8vCQpPj5e586d0969e806W7ZsUUlJieLi4sw627dvV1FRkVknPT1d7du3V6NGjWroauqG0kD17bffatOmTWrSpIlLOf1RMx544AFlZWUpMzPTXKKiojR16lTzEzH0Rc2w2+267bbbynyU/JtvvlFMTIwkqWfPnqpfv77L77Ls7Gzl5OS4/C47cOCASxAu/U/L1YEN5SsqKlJRUZHq1XN9C/Xz8zNHFOkL6xiGoUmTJun999/Xli1bFBsb61Ju1WsdHx/vcozSOqXHqGxjfcJPP/1k7N+/39i/f78hyUhLSzP2799vfpps/vz5RlhYmPHBBx8YWVlZxtChQ43Y2Fjj559/No8xYMAAo0ePHsauXbuMzz77zGjbtq0xYsQIs/zcuXNGRESE8cADDxgHDx40Vq5caQQFBRl/+9vfavx6PV1F/VFYWGjcddddRvPmzY3MzEwjNzfXXK78RAz9YY3r/Wxc7epP/xkGfWGV6/XFmjVrjPr16xtvvPGG8e233xqvvPKK4efnZ3z66afmMR588EGjRYsWxpYtW4w9e/YY8fHxRnx8vFl++fJlo3PnzkZCQoKRmZlpbNiwwWjWrJkxffr0Gr9eT3a9vrjjjjuMTp06GVu3bjW+++47Y+nSpUZgYKDx2muvmcegL6zx0EMPGaGhocYnn3zi8n5w8eJFs44Vr/V3331nBAUFGVOnTjW+/vprY9GiRYafn5+xYcOGSrfVZ0LV1q1bDUllljFjxhiG8ctjFWbOnGlEREQYAQEBRr9+/Yzs7GyXY/z444/GiBEjjJtuuskICQkxxo0bZ/z0008udb788kujV69eRkBAgPGrX/3KmD9/fk1dolepqD+OHj1abpkkY+vWreYx6A9rXO9n42rlhSr6whqV6Yu33nrLaNOmjREYGGh069bNWLt2rcsxfv75Z+Mvf/mL0ahRIyMoKMi45557jNzcXJc6x44dMwYOHGg0aNDAaNq0qTFlyhSjqKioJi7Ra1yvL3Jzc42xY8caUVFRRmBgoNG+fXvjhRdeMEpKSsxj0BfWuNb7wdKlS806Vr3WW7duNbp3727Y7XajVatWLueoDNv/32AAAABUAXOqAAAALECoAgAAsAChCgAAwAKEKgAAAAsQqgAAACxAqAIAALAAoQoAAMAChCoAAAALEKoAAAAsQKgCAACwAKEKAADAAoQqAAAAC/x/L/USoZWJ/b0AAAAASUVORK5CYII=",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"df.plot.hist(column=[\"Birth year\"], xlim=(1000, 2000), bins=4000)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Помимо этого обработаем колонку страны таким образом, что каждый человек, который жил не в одной стране, будет занимать более одной строки, в соответствии с количеством стран в которых он жил."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"Index: 116555 entries, 0 to 99999\n",
|
||
"Data columns (total 10 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 Id 116555 non-null object \n",
|
||
" 1 Name 116555 non-null object \n",
|
||
" 2 Short description 116555 non-null object \n",
|
||
" 3 Gender 116555 non-null object \n",
|
||
" 4 Country 116555 non-null object \n",
|
||
" 5 Occupation 116555 non-null object \n",
|
||
" 6 Birth year 116555 non-null int64 \n",
|
||
" 7 Death year 116555 non-null float64\n",
|
||
" 8 Manner of death 116555 non-null object \n",
|
||
" 9 Age of death 116555 non-null float64\n",
|
||
"dtypes: float64(2), int64(1), object(7)\n",
|
||
"memory usage: 9.8+ MB\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"df['Country'] = df['Country'].str.split('; ')\n",
|
||
"df = df.explode('Country')\n",
|
||
"df.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Далее выполним разбиение на обучающую, контрольную и тестовую выборки."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"121\n",
|
||
"Обучающая выборка: (67038, 10)\n",
|
||
"Country\n",
|
||
"Germany 15128\n",
|
||
"United States of America 8946\n",
|
||
"France 4715\n",
|
||
"NaN 3248\n",
|
||
"United Kingdom 2796\n",
|
||
" ... \n",
|
||
"Song dynasty 32\n",
|
||
"Paraguay 31\n",
|
||
"Kingdom of Sardinia 31\n",
|
||
"Confederation of the Rhine 30\n",
|
||
"Kingdom of Saxony 30\n",
|
||
"Name: count, Length: 121, dtype: int64\n",
|
||
"Контрольная выборка: (22346, 10)\n",
|
||
"Country\n",
|
||
"Germany 5043\n",
|
||
"United States of America 2982\n",
|
||
"France 1572\n",
|
||
"NaN 1082\n",
|
||
"United Kingdom 932\n",
|
||
" ... \n",
|
||
"Vietnam 11\n",
|
||
"Paraguay 10\n",
|
||
"Kingdom of Saxony 10\n",
|
||
"Confederation of the Rhine 10\n",
|
||
"Kingdom of Sardinia 10\n",
|
||
"Name: count, Length: 121, dtype: int64\n",
|
||
"Тестовая выборка: (22347, 10)\n",
|
||
"Country\n",
|
||
"Germany 5043\n",
|
||
"United States of America 2982\n",
|
||
"France 1572\n",
|
||
"NaN 1083\n",
|
||
"United Kingdom 933\n",
|
||
" ... \n",
|
||
"England 11\n",
|
||
"Confederation of the Rhine 10\n",
|
||
"Paraguay 10\n",
|
||
"Kingdom of Sardinia 10\n",
|
||
"Kingdom of Saxony 10\n",
|
||
"Name: count, Length: 121, dtype: int64\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"data = df.copy()\n",
|
||
"\n",
|
||
"value_counts = data[\"Country\"].value_counts()\n",
|
||
"rare = value_counts[value_counts < 50].index\n",
|
||
"data = data[~data[\"Country\"].isin(rare)]\n",
|
||
"\n",
|
||
"print(len(data[\"Country\"].unique()))\n",
|
||
"\n",
|
||
" \n",
|
||
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
|
||
" data, stratify_colname=\"Country\", frac_train=0.60, frac_val=0.20, frac_test=0.20)\n",
|
||
"\n",
|
||
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
||
"print(df_train[\"Country\"].value_counts())\n",
|
||
"\n",
|
||
"print(\"Контрольная выборка: \", df_val.shape)\n",
|
||
"print(df_val[\"Country\"].value_counts())\n",
|
||
"\n",
|
||
"print(\"Тестовая выборка: \", df_test.shape)\n",
|
||
"print(df_test[\"Country\"].value_counts())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"В данных были удалены строки, у которых были \"редкие\" страны. Данные наращивать не будем, поскольку в этом нет необходимости\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Выполним конструирование признаков. "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Начнем с унитарного кодирования категориальных признаков. Под этот пункт подходит столбец страна"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country_Albania</th>\n",
|
||
" <th>Country_Argentina</th>\n",
|
||
" <th>Country_Australia</th>\n",
|
||
" <th>Country_Austria</th>\n",
|
||
" <th>Country_Austria-Hungary</th>\n",
|
||
" <th>Country_Austrian Empire</th>\n",
|
||
" <th>Country_Belgium</th>\n",
|
||
" <th>Country_Bolivia</th>\n",
|
||
" <th>Country_Brazil</th>\n",
|
||
" <th>Country_British Raj</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>Country_United Kingdom of Great Britain and Ireland</th>\n",
|
||
" <th>Country_United States of America</th>\n",
|
||
" <th>Country_Uruguay</th>\n",
|
||
" <th>Country_Venezuela</th>\n",
|
||
" <th>Country_Vietnam</th>\n",
|
||
" <th>Country_Wales</th>\n",
|
||
" <th>Country_Weimar Republic</th>\n",
|
||
" <th>Country_West Germany</th>\n",
|
||
" <th>Country_Yugoslavia</th>\n",
|
||
" <th>Country_ancient Rome</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>111726</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>111727</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>111728</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>111729</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>111730</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>111731 rows × 120 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country_Albania Country_Argentina Country_Australia \\\n",
|
||
"0 0.0 0.0 0.0 \n",
|
||
"1 0.0 0.0 0.0 \n",
|
||
"2 0.0 0.0 0.0 \n",
|
||
"3 0.0 0.0 0.0 \n",
|
||
"4 0.0 0.0 0.0 \n",
|
||
"... ... ... ... \n",
|
||
"111726 0.0 0.0 0.0 \n",
|
||
"111727 0.0 0.0 0.0 \n",
|
||
"111728 0.0 0.0 0.0 \n",
|
||
"111729 0.0 0.0 0.0 \n",
|
||
"111730 0.0 0.0 0.0 \n",
|
||
"\n",
|
||
" Country_Austria Country_Austria-Hungary Country_Austrian Empire \\\n",
|
||
"0 0.0 0.0 0.0 \n",
|
||
"1 0.0 0.0 0.0 \n",
|
||
"2 0.0 0.0 0.0 \n",
|
||
"3 0.0 0.0 0.0 \n",
|
||
"4 0.0 0.0 0.0 \n",
|
||
"... ... ... ... \n",
|
||
"111726 0.0 0.0 0.0 \n",
|
||
"111727 0.0 0.0 0.0 \n",
|
||
"111728 0.0 0.0 0.0 \n",
|
||
"111729 0.0 0.0 0.0 \n",
|
||
"111730 0.0 0.0 0.0 \n",
|
||
"\n",
|
||
" Country_Belgium Country_Bolivia Country_Brazil Country_British Raj \\\n",
|
||
"0 0.0 0.0 0.0 0.0 \n",
|
||
"1 0.0 0.0 0.0 0.0 \n",
|
||
"2 0.0 0.0 0.0 0.0 \n",
|
||
"3 0.0 0.0 0.0 0.0 \n",
|
||
"4 0.0 0.0 0.0 0.0 \n",
|
||
"... ... ... ... ... \n",
|
||
"111726 0.0 0.0 0.0 0.0 \n",
|
||
"111727 0.0 0.0 0.0 0.0 \n",
|
||
"111728 0.0 0.0 0.0 0.0 \n",
|
||
"111729 0.0 0.0 0.0 0.0 \n",
|
||
"111730 0.0 0.0 0.0 0.0 \n",
|
||
"\n",
|
||
" ... Country_United Kingdom of Great Britain and Ireland \\\n",
|
||
"0 ... 0.0 \n",
|
||
"1 ... 0.0 \n",
|
||
"2 ... 0.0 \n",
|
||
"3 ... 0.0 \n",
|
||
"4 ... 0.0 \n",
|
||
"... ... ... \n",
|
||
"111726 ... 0.0 \n",
|
||
"111727 ... 0.0 \n",
|
||
"111728 ... 0.0 \n",
|
||
"111729 ... 0.0 \n",
|
||
"111730 ... 0.0 \n",
|
||
"\n",
|
||
" Country_United States of America Country_Uruguay Country_Venezuela \\\n",
|
||
"0 1.0 0.0 0.0 \n",
|
||
"1 0.0 0.0 0.0 \n",
|
||
"2 0.0 0.0 0.0 \n",
|
||
"3 1.0 0.0 0.0 \n",
|
||
"4 0.0 0.0 0.0 \n",
|
||
"... ... ... ... \n",
|
||
"111726 0.0 0.0 0.0 \n",
|
||
"111727 1.0 0.0 0.0 \n",
|
||
"111728 1.0 0.0 0.0 \n",
|
||
"111729 0.0 0.0 0.0 \n",
|
||
"111730 0.0 0.0 0.0 \n",
|
||
"\n",
|
||
" Country_Vietnam Country_Wales Country_Weimar Republic \\\n",
|
||
"0 0.0 0.0 0.0 \n",
|
||
"1 0.0 0.0 0.0 \n",
|
||
"2 0.0 0.0 0.0 \n",
|
||
"3 0.0 0.0 0.0 \n",
|
||
"4 0.0 0.0 0.0 \n",
|
||
"... ... ... ... \n",
|
||
"111726 0.0 0.0 0.0 \n",
|
||
"111727 0.0 0.0 0.0 \n",
|
||
"111728 0.0 0.0 0.0 \n",
|
||
"111729 0.0 0.0 0.0 \n",
|
||
"111730 0.0 0.0 0.0 \n",
|
||
"\n",
|
||
" Country_West Germany Country_Yugoslavia Country_ancient Rome \n",
|
||
"0 0.0 0.0 0.0 \n",
|
||
"1 0.0 0.0 0.0 \n",
|
||
"2 0.0 0.0 0.0 \n",
|
||
"3 0.0 0.0 0.0 \n",
|
||
"4 0.0 0.0 0.0 \n",
|
||
"... ... ... ... \n",
|
||
"111726 0.0 0.0 0.0 \n",
|
||
"111727 0.0 0.0 0.0 \n",
|
||
"111728 0.0 0.0 0.0 \n",
|
||
"111729 0.0 0.0 0.0 \n",
|
||
"111730 0.0 0.0 0.0 \n",
|
||
"\n",
|
||
"[111731 rows x 120 columns]"
|
||
]
|
||
},
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
|
||
"\n",
|
||
"encoded_values = encoder.fit_transform(data[[\"Country\"]])\n",
|
||
"\n",
|
||
"encoded_columns = encoder.get_feature_names_out([\"Country\"])\n",
|
||
"\n",
|
||
"encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n",
|
||
"\n",
|
||
"encoded_values_df\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Далее выполним дискретизацию числовых признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Age of death</th>\n",
|
||
" <th>Age of death</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>67.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>67.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>49.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>56.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>57.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>57.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>42.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>88.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>86.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>61.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>73.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>73.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>42.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>98.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>56.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>56.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>56.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>56.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>63.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>91.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Age of death Age of death\n",
|
||
"0 67.0 middle-aged\n",
|
||
"0 67.0 middle-aged\n",
|
||
"1 49.0 middle-aged\n",
|
||
"2 56.0 middle-aged\n",
|
||
"4 57.0 middle-aged\n",
|
||
"4 57.0 middle-aged\n",
|
||
"5 42.0 middle-aged\n",
|
||
"6 88.0 old\n",
|
||
"7 86.0 old\n",
|
||
"8 61.0 middle-aged\n",
|
||
"9 73.0 middle-aged\n",
|
||
"9 73.0 middle-aged\n",
|
||
"10 42.0 middle-aged\n",
|
||
"12 98.0 old\n",
|
||
"13 56.0 middle-aged\n",
|
||
"14 56.0 middle-aged\n",
|
||
"14 56.0 middle-aged\n",
|
||
"14 56.0 middle-aged\n",
|
||
"16 63.0 middle-aged\n",
|
||
"17 91.0 old"
|
||
]
|
||
},
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"labels = [\"young\", \"middle-aged\", \"old\"]\n",
|
||
"num_bins = 3\n",
|
||
"hist1, bins1 = np.histogram(data[\"Age of death\"].fillna(data[\"Age of death\"].median()), bins=num_bins)\n",
|
||
"pd.concat([data[\"Age of death\"], pd.cut(data[\"Age of death\"], list(bins1), labels=labels)], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Выполнить «ручной» синтез признаков в рамках данного набора данных не является возможным."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Масштабирование признаков на основе нормировки и стандартизации в рамках данного набора данных не является необходимым."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Выполним конструирование признаков с применением фреймворка Featuretools. "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 31,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Built 7 features\n",
|
||
"Elapsed: 00:00 | Progress: 100%|██████████\n",
|
||
" Gender Country Occupation Birth year Death year \\\n",
|
||
"Id \n",
|
||
"Q23 Male United States of America Politician 1732 1799.0 \n",
|
||
"Q42 Male United Kingdom Artist 1952 2001.0 \n",
|
||
"Q91 Male United States of America Politician 1809 1865.0 \n",
|
||
"Q255 Male Holy Roman Empire Artist 1770 1827.0 \n",
|
||
"Q260 Male Kingdom of France Egyptologist 1790 1832.0 \n",
|
||
"\n",
|
||
" Manner of death Age of death \n",
|
||
"Id \n",
|
||
"Q23 natural causes 67.0 \n",
|
||
"Q42 natural causes 49.0 \n",
|
||
"Q91 homicide 56.0 \n",
|
||
"Q255 NaN 57.0 \n",
|
||
"Q260 natural causes 42.0 \n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"data1 = data.drop_duplicates(subset=\"Id\", keep=\"first\")\n",
|
||
"\n",
|
||
"df_train = pd.DataFrame(data1)\n",
|
||
"\n",
|
||
"# Создание EntitySet\n",
|
||
"es = ft.EntitySet(id='death_data')\n",
|
||
"\n",
|
||
"# Добавление DataFrame в EntitySet\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name='deaths',\n",
|
||
" dataframe=df_train,\n",
|
||
" index='Id',\n",
|
||
" make_index=False\n",
|
||
")\n",
|
||
"\n",
|
||
"# Определение примитивов (операций) для конструирования признаков\n",
|
||
"feature_matrix, feature_defs = ft.dfs(\n",
|
||
" entityset=es,\n",
|
||
" target_dataframe_name='deaths',\n",
|
||
" max_depth=2,\n",
|
||
" verbose=1,\n",
|
||
" n_jobs=1\n",
|
||
")\n",
|
||
"\n",
|
||
"# Вывод сгенерированных признаков\n",
|
||
"print(feature_matrix.head())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Все наборы признаков имеют плохую предсказательную способность, высокую скорость вычисления, малую надежность, корреляцию и цельность. Они не являются информативными, как и сам набор данных"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "aimvenv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.5"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|