1301 lines
70 KiB
Plaintext
1301 lines
70 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Вариант: Список людей. "
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 27,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"RangeIndex: 100000 entries, 0 to 99999\n",
|
|||
|
"Data columns (total 10 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 Id 100000 non-null object \n",
|
|||
|
" 1 Name 100000 non-null object \n",
|
|||
|
" 2 Short description 99923 non-null object \n",
|
|||
|
" 3 Gender 98015 non-null object \n",
|
|||
|
" 4 Country 94533 non-null object \n",
|
|||
|
" 5 Occupation 97299 non-null object \n",
|
|||
|
" 6 Birth year 100000 non-null int64 \n",
|
|||
|
" 7 Death year 99999 non-null float64\n",
|
|||
|
" 8 Manner of death 14821 non-null object \n",
|
|||
|
" 9 Age of death 99999 non-null float64\n",
|
|||
|
"dtypes: float64(2), int64(1), object(7)\n",
|
|||
|
"memory usage: 7.6+ MB\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from sklearn.preprocessing import LabelEncoder\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from imblearn.under_sampling import RandomUnderSampler\n",
|
|||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"import featuretools as ft\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# Функция для применения oversampling\n",
|
|||
|
"def apply_oversampling(X, y):\n",
|
|||
|
" oversampler = RandomOverSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
|
|||
|
" return X_resampled, y_resampled\n",
|
|||
|
"\n",
|
|||
|
"# Функция для применения undersampling\n",
|
|||
|
"def apply_undersampling(X, y):\n",
|
|||
|
" undersampler = RandomUnderSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = undersampler.fit_resample(X, y)\n",
|
|||
|
" return X_resampled, y_resampled\n",
|
|||
|
"\n",
|
|||
|
"def split_stratified_into_train_val_test(\n",
|
|||
|
" df_input,\n",
|
|||
|
" stratify_colname=\"y\",\n",
|
|||
|
" frac_train=0.6,\n",
|
|||
|
" frac_val=0.15,\n",
|
|||
|
" frac_test=0.25,\n",
|
|||
|
" random_state=None,\n",
|
|||
|
"):\n",
|
|||
|
" \"\"\"\n",
|
|||
|
" Splits a Pandas dataframe into three subsets (train, val, and test)\n",
|
|||
|
" following fractional ratios provided by the user, where each subset is\n",
|
|||
|
" stratified by the values in a specific column (that is, each subset has\n",
|
|||
|
" the same relative frequency of the values in the column). It performs this\n",
|
|||
|
" splitting by running train_test_split() twice.\n",
|
|||
|
"\n",
|
|||
|
" Parameters\n",
|
|||
|
" ----------\n",
|
|||
|
" df_input : Pandas dataframe\n",
|
|||
|
" Input dataframe to be split.\n",
|
|||
|
" stratify_colname : str\n",
|
|||
|
" The name of the column that will be used for stratification. Usually\n",
|
|||
|
" this column would be for the label.\n",
|
|||
|
" frac_train : float\n",
|
|||
|
" frac_val : float\n",
|
|||
|
" frac_test : float\n",
|
|||
|
" The ratios with which the dataframe will be split into train, val, and\n",
|
|||
|
" test data. The values should be expressed as float fractions and should\n",
|
|||
|
" sum to 1.0.\n",
|
|||
|
" random_state : int, None, or RandomStateInstance\n",
|
|||
|
" Value to be passed to train_test_split().\n",
|
|||
|
"\n",
|
|||
|
" Returns\n",
|
|||
|
" -------\n",
|
|||
|
" df_train, df_val, df_test :\n",
|
|||
|
" Dataframes containing the three splits.\n",
|
|||
|
" \"\"\"\n",
|
|||
|
"\n",
|
|||
|
" if frac_train + frac_val + frac_test != 1.0:\n",
|
|||
|
" raise ValueError(\n",
|
|||
|
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
|
|||
|
" % (frac_train, frac_val, frac_test)\n",
|
|||
|
" )\n",
|
|||
|
"\n",
|
|||
|
" if stratify_colname not in df_input.columns:\n",
|
|||
|
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
|
|||
|
"\n",
|
|||
|
" X = df_input # Contains all columns.\n",
|
|||
|
" y = df_input[\n",
|
|||
|
" [stratify_colname]\n",
|
|||
|
" ] # Dataframe of just the column on which to stratify.\n",
|
|||
|
"\n",
|
|||
|
" # Split original dataframe into train and temp dataframes.\n",
|
|||
|
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
|
|||
|
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
|
|||
|
" )\n",
|
|||
|
"\n",
|
|||
|
" # Split the temp dataframe into val and test dataframes.\n",
|
|||
|
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
|
|||
|
" df_val, df_test, y_val, y_test = train_test_split(\n",
|
|||
|
" df_temp,\n",
|
|||
|
" y_temp,\n",
|
|||
|
" stratify=y_temp,\n",
|
|||
|
" test_size=relative_frac_test,\n",
|
|||
|
" random_state=random_state,\n",
|
|||
|
" )\n",
|
|||
|
"\n",
|
|||
|
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
|||
|
"\n",
|
|||
|
" return df_train, df_val, df_test\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"df = pd.read_csv(\"../data/age.csv\", nrows=100000)\n",
|
|||
|
"df.info()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Такую информацию могут использовать компании, связанные с историей и культурой, с разработкой игр (GameDev) и с кинопроизводством. Реальные имена могут сделать тот же фильм исторически более достоверным."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"В качестве бизнес-целей выделим следующие два варианта:\n",
|
|||
|
" 1) GameDev. Создание игры про конкретного персонажа, живущего в конкретном временном промежутке в конкретной стране. \n",
|
|||
|
" 2) Исследование зависимости длительности жизни от страны проживания.\n",
|
|||
|
" "
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Поскольку данные неполные, пропуски в них необходимо заполнить стандартными значениями:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 9,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Id 0\n",
|
|||
|
"Name 0\n",
|
|||
|
"Short description 77\n",
|
|||
|
"Gender 1985\n",
|
|||
|
"Country 5467\n",
|
|||
|
"Occupation 2701\n",
|
|||
|
"Birth year 0\n",
|
|||
|
"Death year 1\n",
|
|||
|
"Manner of death 85179\n",
|
|||
|
"Age of death 1\n",
|
|||
|
"dtype: int64\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(df.isnull().sum())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 10,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"Index: 99922 entries, 0 to 99999\n",
|
|||
|
"Data columns (total 10 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 Id 99922 non-null object \n",
|
|||
|
" 1 Name 99922 non-null object \n",
|
|||
|
" 2 Short description 99922 non-null object \n",
|
|||
|
" 3 Gender 99922 non-null object \n",
|
|||
|
" 4 Country 99922 non-null object \n",
|
|||
|
" 5 Occupation 99922 non-null object \n",
|
|||
|
" 6 Birth year 99922 non-null int64 \n",
|
|||
|
" 7 Death year 99922 non-null float64\n",
|
|||
|
" 8 Manner of death 99922 non-null object \n",
|
|||
|
" 9 Age of death 99922 non-null float64\n",
|
|||
|
"dtypes: float64(2), int64(1), object(7)\n",
|
|||
|
"memory usage: 8.4+ MB\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Id</th>\n",
|
|||
|
" <th>Name</th>\n",
|
|||
|
" <th>Short description</th>\n",
|
|||
|
" <th>Gender</th>\n",
|
|||
|
" <th>Country</th>\n",
|
|||
|
" <th>Occupation</th>\n",
|
|||
|
" <th>Birth year</th>\n",
|
|||
|
" <th>Death year</th>\n",
|
|||
|
" <th>Manner of death</th>\n",
|
|||
|
" <th>Age of death</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>99995</th>\n",
|
|||
|
" <td>Q729652</td>\n",
|
|||
|
" <td>Jacques-Joseph Moreau</td>\n",
|
|||
|
" <td>French psychiatrist</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>France</td>\n",
|
|||
|
" <td>Psychiatrist; psychologist</td>\n",
|
|||
|
" <td>1804</td>\n",
|
|||
|
" <td>1884.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>80.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>99996</th>\n",
|
|||
|
" <td>Q729661</td>\n",
|
|||
|
" <td>Jerome Wiesner</td>\n",
|
|||
|
" <td>American academic engineer</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>United States of America</td>\n",
|
|||
|
" <td>Researcher</td>\n",
|
|||
|
" <td>1915</td>\n",
|
|||
|
" <td>1994.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>79.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>99997</th>\n",
|
|||
|
" <td>Q729662</td>\n",
|
|||
|
" <td>Westmoreland Davis</td>\n",
|
|||
|
" <td>American politician (1859-1942)</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>United States of America</td>\n",
|
|||
|
" <td>Politician</td>\n",
|
|||
|
" <td>1859</td>\n",
|
|||
|
" <td>1942.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>83.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>99998</th>\n",
|
|||
|
" <td>Q729674</td>\n",
|
|||
|
" <td>John Needham</td>\n",
|
|||
|
" <td>English biologist and Roman Catholic priest</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>England</td>\n",
|
|||
|
" <td>Religious figure</td>\n",
|
|||
|
" <td>1713</td>\n",
|
|||
|
" <td>1810.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>97.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>99999</th>\n",
|
|||
|
" <td>Q729679</td>\n",
|
|||
|
" <td>Francis Bourne</td>\n",
|
|||
|
" <td>Catholic cardinal</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>United Kingdom</td>\n",
|
|||
|
" <td>Religious figure</td>\n",
|
|||
|
" <td>1861</td>\n",
|
|||
|
" <td>1934.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>73.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Id Name \\\n",
|
|||
|
"99995 Q729652 Jacques-Joseph Moreau \n",
|
|||
|
"99996 Q729661 Jerome Wiesner \n",
|
|||
|
"99997 Q729662 Westmoreland Davis \n",
|
|||
|
"99998 Q729674 John Needham \n",
|
|||
|
"99999 Q729679 Francis Bourne \n",
|
|||
|
"\n",
|
|||
|
" Short description Gender \\\n",
|
|||
|
"99995 French psychiatrist Male \n",
|
|||
|
"99996 American academic engineer Male \n",
|
|||
|
"99997 American politician (1859-1942) Male \n",
|
|||
|
"99998 English biologist and Roman Catholic priest Male \n",
|
|||
|
"99999 Catholic cardinal Male \n",
|
|||
|
"\n",
|
|||
|
" Country Occupation Birth year \\\n",
|
|||
|
"99995 France Psychiatrist; psychologist 1804 \n",
|
|||
|
"99996 United States of America Researcher 1915 \n",
|
|||
|
"99997 United States of America Politician 1859 \n",
|
|||
|
"99998 England Religious figure 1713 \n",
|
|||
|
"99999 United Kingdom Religious figure 1861 \n",
|
|||
|
"\n",
|
|||
|
" Death year Manner of death Age of death \n",
|
|||
|
"99995 1884.0 NaN 80.0 \n",
|
|||
|
"99996 1994.0 NaN 79.0 \n",
|
|||
|
"99997 1942.0 NaN 83.0 \n",
|
|||
|
"99998 1810.0 NaN 97.0 \n",
|
|||
|
"99999 1934.0 NaN 73.0 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 10,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df.fillna({\"Gender\": \"NaN\", \"Country\": \"NaN\", \"Occupation\" : \"NaN\", \"Manner of death\" : \"NaN\"}, inplace=True)\n",
|
|||
|
"df = df.dropna()\n",
|
|||
|
"df.info()\n",
|
|||
|
"df.tail()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Пропуски в столбцах Gender, Country, Occupation и Manner of death заполнены строкой «NaN»; удалены только те строки, в которых не было года смерти, возраста смерти или краткого описания."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 11,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<Axes: ylabel='Frequency'>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 11,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAlUAAAGdCAYAAAA7VYb2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA5oUlEQVR4nO3de1xVVeL///cRPCDGxRscGBHxnnezhmFSy9EBLx+zdD5TanljdGpwKjEzv6mp9UjDoqwspykvPbJ0/IxZo+WIl7QSNS+EWg9KU6kR1E9eTmgCwv790Y/98QgiHjZwDuf1fDz2Q/dea++99lnCebv2OvvYDMMwBAAAgCqpV9sNAAAAqAsIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAf/aboA3KCkp0YkTJxQcHCybzVbbzQEAAJVgGIZ++uknRUVFqV696h9HIlRVwokTJxQdHV3bzQAAAG74/vvv1bx582o/D6GqEoKDgyX90ikhISG13BoAAFAZTqdT0dHR5vt4dSNUVULpLb+QkBBCFQAAXqampu4wUR0AAMAChCoAAAALEKoAAAAswJwqixiGocuXL6u4uLi2mwKL1K9fX35+frXdDACAlyBUWaCwsFC5ubm6ePFibTcFFrLZbGrevLluuumm2m4KAMALEKqqqKSkREePHpWfn5+ioqJkt9t5QGgdYBiGTp8+rR9++EFt27ZlxAoAcF2EqioqLCxUSUmJoqOjFRQUVNvNgYWaNWumY8eOqaioiFAFALguJqpbpCYef4+axYgjAOBGkAQAAAAsQKhChY4dOyabzabMzEy39rfZbFq7dq2lbQIAwBMxp6oatXxifY2e79j8wTdUf+zYsVq+fLm53rhxY912221KTU1V165dJUnR0dHKzc1V06ZNKzzW7NmztXbtWrfDFwAA3o6RKh83YMAA5ebmKjc3V5s3b5a/v7/+67/+yyz38/OTw+GQv3/5+bv0+VzerrCwsLabAADwcoQqHxcQECCHwyGHw6Hu3bvriSee0Pfff6/Tp09LKnv775NPPpHNZtPHH3+snj17KiAgQO+8847mzJmjL7/8UjabTTabTcuWLTPP8b//+7+65557FBQUpLZt2+rDDz+8Znvmzp2rzp07l9nevXt3zZw501x/8803dfPNNyswMFAdOnTQa6+95lJ/2rRpateunYKCgtSqVSvNnDlTRUVFZvns2bPVvXt3vfnmm4qNjVVgYKA7Lx8AACZu/8GUn5+vd955R23atFGTJk0qrPvEE0/o+eefV6tWrRQYGKgpU6Zow4YN2rRpkyQpNDTUrDtnzhylpqZqwYIFeuWVVzRq1CgdP35cjRs3LnPc8ePHa86cOfriiy902223SZL279+vrKwsrVmzRpK0YsUKzZo1S6+++qp69Oih/fv3a8KECWrYsKHGjBkjSQoODtayZcsUFRWlAwcOaMKECQoODtbjjz9unuvw4cP65z//qTVr1vDIBABAlRGqfNy6devMJ4ZfuHBBkZGRWrdu3XUfETF37lz9/ve/N9dvuukm+fv7y+FwlKk7duxYjRgxQpL07LPP6uWXX9bu3bs1YMCAMnWbN2+uxMRELV261AxVS5cu1R133KFWrVpJkp566im98MILGjZsmCQpNjZWX331lf72t7+ZoWrGjBnmMVu2bKnHHntMK1eudAlVhYWFevvtt9WsWbPrv1AAgEpp+cT6G57jW1dw+8/H9e3bV5mZmcrMzNTu3buVmJiogQMH6vjx4xXud+utt1b6HKWT3iWpYcOGCgkJ0alTp65Zf8KECXrvvfd06dIlFRYW6t1339X48eMl/RL8jhw5oqSkJN10003m8swzz+jIkSPmMVatWqXbb79dDodDN910k2bMmKGcnByX88TExBCoAACWYaTKxzVs2FBt2rQx1998802Fhobq73//u5555pkK96us+vXru6zbbDaVlJ
Rcs/6QIUMUEBCg999/X3a7XUVFRfrDH/4g6ZdblJL097//XXFxcS77ld7Cy8jI0KhRozRnzhwlJiYqNDRUK1eu1AsvvOD2NQAAcD2EKriw2WyqV6+efv755xvaz263q7i42JI2+Pv7a8yYMVq6dKnsdrvuu+8+NWjQQJIUERGhqKgofffddxo1alS5++/YsUMxMTF68sknzW3XG3kDAKCqCFU+rqCgQHl5eZKks2fP6tVXX1V+fr6GDBlyQ8dp2bKljh49qszMTDVv3lzBwcEKCAhwu11/+tOfdPPNN0uSPv/8c5eyOXPm6OGHH1ZoaKgGDBiggoIC7dmzR2fPnlVKSoratm2rnJwcrVy5UrfddpvWr1+v999/3+22AABQGcyp8nEbNmxQZGSkIiMjFRcXpy+++EKrV6/WnXfeeUPHGT58uAYMGKC+ffuqWbNmeu+996rUrrZt2+q3v/2tOnToUOY235/+9Ce9+eabWrp0qbp06aI77rhDy5YtU2xsrCTprrvu0uTJkzVp0iR1795dO3bscHkcAwAA1cFmGIZR243wdE6nU6GhoTp//rxCQkJcyi5duqSjR4/yrCOLGYahtm3b6i9/+YtSUlJqpQ30LQDcOE/69F9F79/VoVZHqrZv364hQ4YoKiqq3O+IK32Q5NXLggULzDotW7YsUz5//nyX42RlZal3794KDAxUdHS0UlNTa+Ly4KbTp0/r1VdfVV5ensaNG1fbzQEAoFJqdU7VhQsX1K1bN40fP9585tCVcnNzXdY//vhjJSUlafjw4S7b586dqwkTJpjrwcHB5t+dTqcSEhLUv39/LV68WAcOHND48eMVFhamiRMnWnxFsEJ4eLiaNm2qN954Q40aNart5gAAUCm1GqoGDhyogQMHXrP86gdJfvDBB+rbt6/5EMhSwcHB5T50Uvrl6duFhYVasmSJ7Ha7OnXqpMzMTKWlpRGqPBR3pAEA3shrJqqfPHlS69evV1JSUpmy+fPnq0mTJurRo4cWLFjg8gW/GRkZ6tOnj+x2u7ktMTFR2dnZOnv2bLnnKigokNPpdFkAAAAq4jWPVFi+fLmCg4PL3CZ8+OGHdcstt6hx48basWOHpk+frtzcXKWlpUmS8vLyzE+FlYqIiDDLyru9NG/ePM2ZM6eargQAANRFXhOqlixZolGjRpX5FNaVnwzr2rWr7Ha7/vznP2vevHluPydp+vTpLsd1Op2Kjo6ucB9uWdU99CkA4EZ4Raj69NNPlZ2drVWrVl23blxcnC5fvqxjx46pffv2cjgcOnnypEud0vVrzcMKCAiodCAr/QqWixcvmk/9Rt1QWFgo6f++/gYAgIp4Rah666231LNnT3Xr1u26dTMzM1WvXj2Fh4dLkuLj4/Xkk0+qqKjIDEDp6elq3769JZ8s8/PzU1hYmPkFwUFBQbLZbFU+LmpXSUmJTp8+raCgIPn7e8WPCQCgltXqu0V+fr4OHz5srpd+zUnjxo3VokULSb/celu9enWZL8OVfpmEvmvXLvXt21fBwcHKyMjQ5MmTdf/995uBaeTIkZozZ46SkpI0bdo0HTx4UAsXLtSLL75o2XWUjniVBivUDfXq1VOLFi0IyQCASqnVULVnzx717dvXXC+dxzRmzBgtW7ZMkrRy5UoZhqERI0aU2T8gIEArV67U7NmzVVBQoNjYWE2ePNllPlRoaKg2btyo5ORk9ezZU02bNtWsWbMsfZyCzWZTZGSkwsPDVVRUZNlxUbvsdrvq1fOaD8gCAGoZX1NTCTX9mHsAALwVX1MDAACAKiFUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQ
AAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQh
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df.plot.hist(column=[\"Birth year\"], xlim=(1000, 2000), bins=4000)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Помимо этого, обработаем столбец Country: если для человека указано несколько стран (они перечислены через «; »), он будет занимать несколько строк — по одной на каждую страну, в которой он жил."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 12,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"Index: 116555 entries, 0 to 99999\n",
|
|||
|
"Data columns (total 10 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 Id 116555 non-null object \n",
|
|||
|
" 1 Name 116555 non-null object \n",
|
|||
|
" 2 Short description 116555 non-null object \n",
|
|||
|
" 3 Gender 116555 non-null object \n",
|
|||
|
" 4 Country 116555 non-null object \n",
|
|||
|
" 5 Occupation 116555 non-null object \n",
|
|||
|
" 6 Birth year 116555 non-null int64 \n",
|
|||
|
" 7 Death year 116555 non-null float64\n",
|
|||
|
" 8 Manner of death 116555 non-null object \n",
|
|||
|
" 9 Age of death 116555 non-null float64\n",
|
|||
|
"dtypes: float64(2), int64(1), object(7)\n",
|
|||
|
"memory usage: 9.8+ MB\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df['Country'] = df['Country'].str.split('; ')\n",
|
|||
|
"df = df.explode('Country')\n",
|
|||
|
"df.info()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Далее выполним разбиение на обучающую, контрольную и тестовую выборки."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 15,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"121\n",
|
|||
|
"Обучающая выборка: (67038, 10)\n",
|
|||
|
"Country\n",
|
|||
|
"Germany 15128\n",
|
|||
|
"United States of America 8946\n",
|
|||
|
"France 4715\n",
|
|||
|
"NaN 3248\n",
|
|||
|
"United Kingdom 2796\n",
|
|||
|
" ... \n",
|
|||
|
"Song dynasty 32\n",
|
|||
|
"Paraguay 31\n",
|
|||
|
"Kingdom of Sardinia 31\n",
|
|||
|
"Confederation of the Rhine 30\n",
|
|||
|
"Kingdom of Saxony 30\n",
|
|||
|
"Name: count, Length: 121, dtype: int64\n",
|
|||
|
"Контрольная выборка: (22346, 10)\n",
|
|||
|
"Country\n",
|
|||
|
"Germany 5043\n",
|
|||
|
"United States of America 2982\n",
|
|||
|
"France 1572\n",
|
|||
|
"NaN 1082\n",
|
|||
|
"United Kingdom 932\n",
|
|||
|
" ... \n",
|
|||
|
"Vietnam 11\n",
|
|||
|
"Paraguay 10\n",
|
|||
|
"Kingdom of Saxony 10\n",
|
|||
|
"Confederation of the Rhine 10\n",
|
|||
|
"Kingdom of Sardinia 10\n",
|
|||
|
"Name: count, Length: 121, dtype: int64\n",
|
|||
|
"Тестовая выборка: (22347, 10)\n",
|
|||
|
"Country\n",
|
|||
|
"Germany 5043\n",
|
|||
|
"United States of America 2982\n",
|
|||
|
"France 1572\n",
|
|||
|
"NaN 1083\n",
|
|||
|
"United Kingdom 933\n",
|
|||
|
" ... \n",
|
|||
|
"England 11\n",
|
|||
|
"Confederation of the Rhine 10\n",
|
|||
|
"Paraguay 10\n",
|
|||
|
"Kingdom of Sardinia 10\n",
|
|||
|
"Kingdom of Saxony 10\n",
|
|||
|
"Name: count, Length: 121, dtype: int64\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"data = df.copy()\n",
|
|||
|
"\n",
|
|||
|
"value_counts = data[\"Country\"].value_counts()\n",
|
|||
|
"rare = value_counts[value_counts < 50].index\n",
|
|||
|
"data = data[~data[\"Country\"].isin(rare)]\n",
|
|||
|
"\n",
|
|||
|
"print(len(data[\"Country\"].unique()))\n",
|
|||
|
"\n",
|
|||
|
" \n",
|
|||
|
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
|
|||
|
" data, stratify_colname=\"Country\", frac_train=0.60, frac_val=0.20, frac_test=0.20)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
|||
|
"print(df_train[\"Country\"].value_counts())\n",
|
|||
|
"\n",
|
|||
|
"print(\"Контрольная выборка: \", df_val.shape)\n",
|
|||
|
"print(df_val[\"Country\"].value_counts())\n",
|
|||
|
"\n",
|
|||
|
"print(\"Тестовая выборка: \", df_test.shape)\n",
|
|||
|
"print(df_test[\"Country\"].value_counts())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Из данных были удалены строки с \"редкими\" странами (встречающимися менее 50 раз). Наращивать данные не будем, поскольку в этом нет необходимости.\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Выполним конструирование признаков. "
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Начнем с унитарного кодирования (one-hot encoding) категориальных признаков. Для этого подходит столбец Country."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 16,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Country_Albania</th>\n",
|
|||
|
" <th>Country_Argentina</th>\n",
|
|||
|
" <th>Country_Australia</th>\n",
|
|||
|
" <th>Country_Austria</th>\n",
|
|||
|
" <th>Country_Austria-Hungary</th>\n",
|
|||
|
" <th>Country_Austrian Empire</th>\n",
|
|||
|
" <th>Country_Belgium</th>\n",
|
|||
|
" <th>Country_Bolivia</th>\n",
|
|||
|
" <th>Country_Brazil</th>\n",
|
|||
|
" <th>Country_British Raj</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>Country_United Kingdom of Great Britain and Ireland</th>\n",
|
|||
|
" <th>Country_United States of America</th>\n",
|
|||
|
" <th>Country_Uruguay</th>\n",
|
|||
|
" <th>Country_Venezuela</th>\n",
|
|||
|
" <th>Country_Vietnam</th>\n",
|
|||
|
" <th>Country_Wales</th>\n",
|
|||
|
" <th>Country_Weimar Republic</th>\n",
|
|||
|
" <th>Country_West Germany</th>\n",
|
|||
|
" <th>Country_Yugoslavia</th>\n",
|
|||
|
" <th>Country_ancient Rome</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>111726</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>111727</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>111728</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>111729</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>111730</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>111731 rows × 120 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Country_Albania Country_Argentina Country_Australia \\\n",
|
|||
|
"0 0.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"111726 0.0 0.0 0.0 \n",
|
|||
|
"111727 0.0 0.0 0.0 \n",
|
|||
|
"111728 0.0 0.0 0.0 \n",
|
|||
|
"111729 0.0 0.0 0.0 \n",
|
|||
|
"111730 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" Country_Austria Country_Austria-Hungary Country_Austrian Empire \\\n",
|
|||
|
"0 0.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"111726 0.0 0.0 0.0 \n",
|
|||
|
"111727 0.0 0.0 0.0 \n",
|
|||
|
"111728 0.0 0.0 0.0 \n",
|
|||
|
"111729 0.0 0.0 0.0 \n",
|
|||
|
"111730 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" Country_Belgium Country_Bolivia Country_Brazil Country_British Raj \\\n",
|
|||
|
"0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"111726 0.0 0.0 0.0 0.0 \n",
|
|||
|
"111727 0.0 0.0 0.0 0.0 \n",
|
|||
|
"111728 0.0 0.0 0.0 0.0 \n",
|
|||
|
"111729 0.0 0.0 0.0 0.0 \n",
|
|||
|
"111730 0.0 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" ... Country_United Kingdom of Great Britain and Ireland \\\n",
|
|||
|
"0 ... 0.0 \n",
|
|||
|
"1 ... 0.0 \n",
|
|||
|
"2 ... 0.0 \n",
|
|||
|
"3 ... 0.0 \n",
|
|||
|
"4 ... 0.0 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"111726 ... 0.0 \n",
|
|||
|
"111727 ... 0.0 \n",
|
|||
|
"111728 ... 0.0 \n",
|
|||
|
"111729 ... 0.0 \n",
|
|||
|
"111730 ... 0.0 \n",
|
|||
|
"\n",
|
|||
|
" Country_United States of America Country_Uruguay Country_Venezuela \\\n",
|
|||
|
"0 1.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 0.0 \n",
|
|||
|
"3 1.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"111726 0.0 0.0 0.0 \n",
|
|||
|
"111727 1.0 0.0 0.0 \n",
|
|||
|
"111728 1.0 0.0 0.0 \n",
|
|||
|
"111729 0.0 0.0 0.0 \n",
|
|||
|
"111730 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" Country_Vietnam Country_Wales Country_Weimar Republic \\\n",
|
|||
|
"0 0.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"111726 0.0 0.0 0.0 \n",
|
|||
|
"111727 0.0 0.0 0.0 \n",
|
|||
|
"111728 0.0 0.0 0.0 \n",
|
|||
|
"111729 0.0 0.0 0.0 \n",
|
|||
|
"111730 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" Country_West Germany Country_Yugoslavia Country_ancient Rome \n",
|
|||
|
"0 0.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"111726 0.0 0.0 0.0 \n",
|
|||
|
"111727 0.0 0.0 0.0 \n",
|
|||
|
"111728 0.0 0.0 0.0 \n",
|
|||
|
"111729 0.0 0.0 0.0 \n",
|
|||
|
"111730 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
"[111731 rows x 120 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 16,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
|
|||
|
"\n",
|
|||
|
"encoded_values = encoder.fit_transform(data[[\"Country\"]])\n",
|
|||
|
"\n",
|
|||
|
"encoded_columns = encoder.get_feature_names_out([\"Country\"])\n",
|
|||
|
"\n",
|
|||
|
"encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n",
|
|||
|
"\n",
|
|||
|
"encoded_values_df\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Далее выполним дискретизацию числового признака «Age of death»: разобьём его на три интервала равной ширины (young, middle-aged, old)."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 19,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Age of death</th>\n",
|
|||
|
" <th>Age of death</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>67.0</td>\n",
|
|||
|
" <td>middle-aged</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>67.0</td>\n",
|
|||
|
" <td>middle-aged</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>49.0</td>\n",
|
|||
|
" <td>middle-aged</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>56.0</td>\n",
|
|||
|
" <td>middle-aged</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>57.0</td>\n",
|
|||
|
" <td>middle-aged</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>57.0</td>\n",
|
|||
|
" <td>middle-aged</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>42.0</td>\n",
|
|||
|
" <td>middle-aged</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <td>88.0</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7</th>\n",
|
|||
|
" <td>86.0</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8</th>\n",
|
|||
|
" <td>61.0</td>\n",
|
|||
|
" <td>middle-aged</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9</th>\n",
|
|||
|
" <td>73.0</td>\n",
|
|||
|
" <td>middle-aged</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9</th>\n",
|
|||
|
" <td>73.0</td>\n",
|
|||
|
" <td>middle-aged</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10</th>\n",
|
|||
|
" <td>42.0</td>\n",
|
|||
|
" <td>middle-aged</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12</th>\n",
|
|||
|
" <td>98.0</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>13</th>\n",
|
|||
|
" <td>56.0</td>\n",
|
|||
|
" <td>middle-aged</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>14</th>\n",
|
|||
|
" <td>56.0</td>\n",
|
|||
|
" <td>middle-aged</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>14</th>\n",
|
|||
|
" <td>56.0</td>\n",
|
|||
|
" <td>middle-aged</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>14</th>\n",
|
|||
|
" <td>56.0</td>\n",
|
|||
|
" <td>middle-aged</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16</th>\n",
|
|||
|
" <td>63.0</td>\n",
|
|||
|
" <td>middle-aged</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>17</th>\n",
|
|||
|
" <td>91.0</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Age of death Age of death\n",
|
|||
|
"0 67.0 middle-aged\n",
|
|||
|
"0 67.0 middle-aged\n",
|
|||
|
"1 49.0 middle-aged\n",
|
|||
|
"2 56.0 middle-aged\n",
|
|||
|
"4 57.0 middle-aged\n",
|
|||
|
"4 57.0 middle-aged\n",
|
|||
|
"5 42.0 middle-aged\n",
|
|||
|
"6 88.0 old\n",
|
|||
|
"7 86.0 old\n",
|
|||
|
"8 61.0 middle-aged\n",
|
|||
|
"9 73.0 middle-aged\n",
|
|||
|
"9 73.0 middle-aged\n",
|
|||
|
"10 42.0 middle-aged\n",
|
|||
|
"12 98.0 old\n",
|
|||
|
"13 56.0 middle-aged\n",
|
|||
|
"14 56.0 middle-aged\n",
|
|||
|
"14 56.0 middle-aged\n",
|
|||
|
"14 56.0 middle-aged\n",
|
|||
|
"16 63.0 middle-aged\n",
|
|||
|
"17 91.0 old"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 19,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"labels = [\"young\", \"middle-aged\", \"old\"]\n",
|
|||
|
"num_bins = 3\n",
|
|||
|
"hist1, bins1 = np.histogram(data[\"Age of death\"].fillna(data[\"Age of death\"].median()), bins=num_bins)\n",
|
|||
|
"pd.concat([data[\"Age of death\"], pd.cut(data[\"Age of death\"], list(bins1), labels=labels)], axis=1).head(20)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"«Ручной» синтез признаков для данного набора данных выполнить невозможно."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Масштабирование признаков (нормировка и стандартизация) для данного набора данных не требуется."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Выполним конструирование признаков с применением фреймворка Featuretools. "
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 31,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Built 7 features\n",
|
|||
|
"Elapsed: 00:00 | Progress: 100%|██████████\n",
|
|||
|
" Gender Country Occupation Birth year Death year \\\n",
|
|||
|
"Id \n",
|
|||
|
"Q23 Male United States of America Politician 1732 1799.0 \n",
|
|||
|
"Q42 Male United Kingdom Artist 1952 2001.0 \n",
|
|||
|
"Q91 Male United States of America Politician 1809 1865.0 \n",
|
|||
|
"Q255 Male Holy Roman Empire Artist 1770 1827.0 \n",
|
|||
|
"Q260 Male Kingdom of France Egyptologist 1790 1832.0 \n",
|
|||
|
"\n",
|
|||
|
" Manner of death Age of death \n",
|
|||
|
"Id \n",
|
|||
|
"Q23 natural causes 67.0 \n",
|
|||
|
"Q42 natural causes 49.0 \n",
|
|||
|
"Q91 homicide 56.0 \n",
|
|||
|
"Q255 NaN 57.0 \n",
|
|||
|
"Q260 natural causes 42.0 \n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"data1 = data.drop_duplicates(subset=\"Id\", keep=\"first\")\n",
|
|||
|
"\n",
|
|||
|
"df_train = pd.DataFrame(data1)\n",
|
|||
|
"\n",
|
|||
|
"# Создание EntitySet\n",
|
|||
|
"es = ft.EntitySet(id='death_data')\n",
|
|||
|
"\n",
|
|||
|
"# Добавление DataFrame в EntitySet\n",
|
|||
|
"es = es.add_dataframe(\n",
|
|||
|
" dataframe_name='deaths',\n",
|
|||
|
" dataframe=df_train,\n",
|
|||
|
" index='Id',\n",
|
|||
|
" make_index=False\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Определение примитивов (операций) для конструирования признаков\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(\n",
|
|||
|
" entityset=es,\n",
|
|||
|
" target_dataframe_name='deaths',\n",
|
|||
|
" max_depth=2,\n",
|
|||
|
" verbose=1,\n",
|
|||
|
" n_jobs=1\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Вывод сгенерированных признаков\n",
|
|||
|
"print(feature_matrix.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Все наборы признаков имеют низкую предсказательную способность, высокую скорость вычисления, а также низкие надёжность, корреляцию и цельность. Они не являются информативными, как и сам набор данных."
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "aimvenv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.5"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|