From a006a5f4523198b04980624011adbc30ad996d77 Mon Sep 17 00:00:00 2001 From: Serxiolog Date: Fri, 8 Nov 2024 15:59:46 +0400 Subject: [PATCH] Lab_3 --- lab_3/lab_3.ipynb | 1300 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1300 insertions(+) create mode 100644 lab_3/lab_3.ipynb diff --git a/lab_3/lab_3.ipynb b/lab_3/lab_3.ipynb new file mode 100644 index 0000000..6c083fc --- /dev/null +++ b/lab_3/lab_3.ipynb @@ -0,0 +1,1300 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Вариант: Список людей. " + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 100000 entries, 0 to 99999\n", + "Data columns (total 10 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Id 100000 non-null object \n", + " 1 Name 100000 non-null object \n", + " 2 Short description 99923 non-null object \n", + " 3 Gender 98015 non-null object \n", + " 4 Country 94533 non-null object \n", + " 5 Occupation 97299 non-null object \n", + " 6 Birth year 100000 non-null int64 \n", + " 7 Death year 99999 non-null float64\n", + " 8 Manner of death 14821 non-null object \n", + " 9 Age of death 99999 non-null float64\n", + "dtypes: float64(2), int64(1), object(7)\n", + "memory usage: 7.6+ MB\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from imblearn.over_sampling import RandomOverSampler\n", + "from imblearn.under_sampling import RandomUnderSampler\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "import numpy as np\n", + "import featuretools as ft\n", + "\n", + "\n", + "# Функция для применения oversampling\n", + "def apply_oversampling(X, y):\n", + " oversampler = RandomOverSampler(random_state=42)\n", + " 
X_resampled, y_resampled = oversampler.fit_resample(X, y)\n", + " return X_resampled, y_resampled\n", + "\n", + "# Функция для применения undersampling\n", + "def apply_undersampling(X, y):\n", + " undersampler = RandomUnderSampler(random_state=42)\n", + " X_resampled, y_resampled = undersampler.fit_resample(X, y)\n", + " return X_resampled, y_resampled\n", + "\n", + "def split_stratified_into_train_val_test(\n", + " df_input,\n", + " stratify_colname=\"y\",\n", + " frac_train=0.6,\n", + " frac_val=0.15,\n", + " frac_test=0.25,\n", + " random_state=None,\n", + "):\n", + " \"\"\"\n", + " Splits a Pandas dataframe into three subsets (train, val, and test)\n", + " following fractional ratios provided by the user, where each subset is\n", + " stratified by the values in a specific column (that is, each subset has\n", + " the same relative frequency of the values in the column). It performs this\n", + " splitting by running train_test_split() twice.\n", + "\n", + " Parameters\n", + " ----------\n", + " df_input : Pandas dataframe\n", + " Input dataframe to be split.\n", + " stratify_colname : str\n", + " The name of the column that will be used for stratification. Usually\n", + " this column would be for the label.\n", + " frac_train : float\n", + " frac_val : float\n", + " frac_test : float\n", + " The ratios with which the dataframe will be split into train, val, and\n", + " test data. 
The values should be expressed as float fractions and should\n", + " sum to 1.0.\n", + " random_state : int, None, or RandomStateInstance\n", + " Value to be passed to train_test_split().\n", + "\n", + " Returns\n", + " -------\n", + " df_train, df_val, df_test :\n", + " Dataframes containing the three splits.\n", + " \"\"\"\n", + "\n", + " if frac_train + frac_val + frac_test != 1.0:\n", + " raise ValueError(\n", + " \"fractions %f, %f, %f do not add up to 1.0\"\n", + " % (frac_train, frac_val, frac_test)\n", + " )\n", + "\n", + " if stratify_colname not in df_input.columns:\n", + " raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n", + "\n", + " X = df_input # Contains all columns.\n", + " y = df_input[\n", + " [stratify_colname]\n", + " ] # Dataframe of just the column on which to stratify.\n", + "\n", + " # Split original dataframe into train and temp dataframes.\n", + " df_train, df_temp, y_train, y_temp = train_test_split(\n", + " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n", + " )\n", + "\n", + " # Split the temp dataframe into val and test dataframes.\n", + " relative_frac_test = frac_test / (frac_val + frac_test)\n", + " df_val, df_test, y_val, y_test = train_test_split(\n", + " df_temp,\n", + " y_temp,\n", + " stratify=y_temp,\n", + " test_size=relative_frac_test,\n", + " random_state=random_state,\n", + " )\n", + "\n", + " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n", + "\n", + " return df_train, df_val, df_test\n", + "\n", + "\n", + "df = pd.read_csv(\"../data/age.csv\", nrows=100000)\n", + "df.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Такую информацию могут использовать компании связанные с историей/культурой, с GameDev-ом, с созданием кинематографа. Реальные имена могут сделать тот же фильм более историчным. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В качестве бизнес-целей выделим следующие два варианта:\n", + " 1) GameDev. Создание игры про конкретного персонажа, живущего в конкретном временном промежутке в конкретной стране. \n", + " 2) Исследование зависимости продолжительности жизни от страны проживания.\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Поскольку данные неполные, пропуски необходимо заполнить стандартными значениями:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Id 0\n", + "Name 0\n", + "Short description 77\n", + "Gender 1985\n", + "Country 5467\n", + "Occupation 2701\n", + "Birth year 0\n", + "Death year 1\n", + "Manner of death 85179\n", + "Age of death 1\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "print(df.isnull().sum())" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 99922 entries, 0 to 99999\n", + "Data columns (total 10 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Id 99922 non-null object \n", + " 1 Name 99922 non-null object \n", + " 2 Short description 99922 non-null object \n", + " 3 Gender 99922 non-null object \n", + " 4 Country 99922 non-null object \n", + " 5 Occupation 99922 non-null object \n", + " 6 Birth year 99922 non-null int64 \n", + " 7 Death year 99922 non-null float64\n", + " 8 Manner of death 99922 non-null object \n", + " 9 Age of death 99922 non-null float64\n", + "dtypes: float64(2), int64(1), object(7)\n", + "memory usage: 8.4+ MB\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdNameShort descriptionGenderCountryOccupationBirth yearDeath yearManner of deathAge of death
99995Q729652Jacques-Joseph MoreauFrench psychiatristMaleFrancePsychiatrist; psychologist18041884.0NaN80.0
99996Q729661Jerome WiesnerAmerican academic engineerMaleUnited States of AmericaResearcher19151994.0NaN79.0
99997Q729662Westmoreland DavisAmerican politician (1859-1942)MaleUnited States of AmericaPolitician18591942.0NaN83.0
99998Q729674John NeedhamEnglish biologist and Roman Catholic priestMaleEnglandReligious figure17131810.0NaN97.0
99999Q729679Francis BourneCatholic cardinalMaleUnited KingdomReligious figure18611934.0NaN73.0
\n", + "
" + ], + "text/plain": [ + " Id Name \\\n", + "99995 Q729652 Jacques-Joseph Moreau \n", + "99996 Q729661 Jerome Wiesner \n", + "99997 Q729662 Westmoreland Davis \n", + "99998 Q729674 John Needham \n", + "99999 Q729679 Francis Bourne \n", + "\n", + " Short description Gender \\\n", + "99995 French psychiatrist Male \n", + "99996 American academic engineer Male \n", + "99997 American politician (1859-1942) Male \n", + "99998 English biologist and Roman Catholic priest Male \n", + "99999 Catholic cardinal Male \n", + "\n", + " Country Occupation Birth year \\\n", + "99995 France Psychiatrist; psychologist 1804 \n", + "99996 United States of America Researcher 1915 \n", + "99997 United States of America Politician 1859 \n", + "99998 England Religious figure 1713 \n", + "99999 United Kingdom Religious figure 1861 \n", + "\n", + " Death year Manner of death Age of death \n", + "99995 1884.0 NaN 80.0 \n", + "99996 1994.0 NaN 79.0 \n", + "99997 1942.0 NaN 83.0 \n", + "99998 1810.0 NaN 97.0 \n", + "99999 1934.0 NaN 73.0 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.fillna({\"Gender\": \"NaN\", \"Country\": \"NaN\", \"Occupation\" : \"NaN\", \"Manner of death\" : \"NaN\"}, inplace=True)\n", + "df = df.dropna()\n", + "df.info()\n", + "df.tail()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Пропуски в категориальных столбцах заполнены значением \"NaN\"; удалены только те строки, в которых отсутствовали год смерти или краткое описание." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAlUAAAGdCAYAAAA7VYb2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA5oUlEQVR4nO3de1xVVeL///cRPCDGxRscGBHxnnezhmFSy9EBLx+zdD5TanljdGpwKjEzv6mp9UjDoqwspykvPbJ0/IxZo+WIl7QSNS+EWg9KU6kR1E9eTmgCwv790Y/98QgiHjZwDuf1fDz2Q/dea++99lnCebv2OvvYDMMwBAAAgCqpV9sNAAAAqAsIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAf/aboA3KCkp0YkTJxQcHCybzVbbzQEAAJVgGIZ++uknRUVFqV696h9HIlRVwokTJxQdHV3bzQAAAG74/vvv1bx582o/D6GqEoKDgyX90ikhISG13BoAAFAZTqdT0dHR5vt4dSNUVULpLb+QkBBCFQAAXqampu4wUR0AAMAChCoAAAALEKoAAAAswJwqixiGocuXL6u4uLi2mwKL1K9fX35+frXdDACAlyBUWaCwsFC5ubm6ePFibTcFFrLZbGrevLluuumm2m4KAMALEKqqqKSkREePHpWfn5+ioqJkt9t5QGgdYBiGTp8+rR9++EFt27ZlxAoAcF2EqioqLCxUSUmJoqOjFRQUVNvNgYWaNWumY8eOqaioiFAFALguJqpbpCYef4+axYgjAOBGkAQAAAAsQKhChY4dOyabzabMzEy39rfZbFq7dq2lbQIAwBMxp6oatXxifY2e79j8wTdUf+zYsVq+fLm53rhxY912221KTU1V165dJUnR0dHKzc1V06ZNKzzW7NmztXbtWrfDFwAA3o6RKh83YMAA5ebmKjc3V5s3b5a/v7/+67/+yyz38/OTw+GQv3/5+bv0+VzerrCwsLabAADwcoQqHxcQECCHwyGHw6Hu3bvriSee0Pfff6/Tp09LKnv775NPPpHNZtPHH3+snj17KiAgQO+8847mzJmjL7/8UjabTTabTcuWLTPP8b//+7+65557FBQUpLZt2+rDDz+8Znvmzp2rzp07l9nevXt3zZw501x/8803dfPNNyswMFAdOnTQa6+95lJ/2rRpateunYKCgtSqVSvNnDlTRUVFZvns2bPVvXt3vfnmm4qNjVVgYKA7Lx8AACZu/8GUn5+vd955R23atFGTJk0qrPvEE0/o+eefV6tWrRQYGKgpU6Zow4YN2rRpkyQpNDTUrDtnzhylpqZqwYIFeuWVVzRq1CgdP35cjRs3LnPc8ePHa86cOfriiy902223SZL279+vrKwsrVmzRpK0YsUKzZo1S6+++qp69Oih/fv3a8KECWrYsKHGjBkjSQoODtayZcsUFRWlAwcOaMKECQoODtbjjz9unuvw4cP65z//qTVr1vDIBABAlRGqfNy6devMJ4ZfuHBBkZGRWrdu3XUfETF37lz9/ve/N9dvuukm+fv7y+FwlKk7duxYjRgxQpL07LPP6uWXX9bu3bs1YMCAMnWbN2+uxMRELV261AxVS5cu1R133KFWrVpJkp566im98MILGjZsmCQpNjZWX331lf72t7+ZoWrGjBnmMVu2bKnHHntMK1eudAlVhYWFevvtt9WsWbPrv1AAgEpp+cT6G57jW1dw+8/H9e3bV5mZmcrMzNTu3buVmJiogQMH6vjx4xXud+utt1b6HKWT3iWpYcOGCgkJ0alTp65Zf8KECXrvvfd06dIlFRYW6t1339X48eMl/RL8jhw5oqSkJN10003m8swzz+jIkSPmMVatWqXbb79dDodDN910k2bMmKGcnByX88TExBCoAACWYaTKxzVs2FBt2rQx1998802Fhobq73//u5555pkK96us+vXru6zbbDaVlJRcs/6QIUMUEBC
g999/X3a7XUVFRfrDH/4g6ZdblJL097//XXFxcS77ld7Cy8jI0KhRozRnzhwlJiYqNDRUK1eu1AsvvOD2NQAAcD2EKriw2WyqV6+efv755xvaz263q7i42JI2+Pv7a8yYMVq6dKnsdrvuu+8+NWjQQJIUERGhqKgofffddxo1alS5++/YsUMxMTF68sknzW3XG3kDAKCqCFU+rqCgQHl5eZKks2fP6tVXX1V+fr6GDBlyQ8dp2bKljh49qszMTDVv3lzBwcEKCAhwu11/+tOfdPPNN0uSPv/8c5eyOXPm6OGHH1ZoaKgGDBiggoIC7dmzR2fPnlVKSoratm2rnJwcrVy5UrfddpvWr1+v999/3+22AABQGcyp8nEbNmxQZGSkIiMjFRcXpy+++EKrV6/WnXfeeUPHGT58uAYMGKC+ffuqWbNmeu+996rUrrZt2+q3v/2tOnToUOY235/+9Ce9+eabWrp0qbp06aI77rhDy5YtU2xsrCTprrvu0uTJkzVp0iR1795dO3bscHkcAwAA1cFmGIZR243wdE6nU6GhoTp//rxCQkJcyi5duqSjR4/yrCOLGYahtm3b6i9/+YtSUlJqpQ30LQDcOE/69F9F79/VoVZHqrZv364hQ4YoKiqq3O+IK32Q5NXLggULzDotW7YsUz5//nyX42RlZal3794KDAxUdHS0UlNTa+Ly4KbTp0/r1VdfVV5ensaNG1fbzQEAoFJqdU7VhQsX1K1bN40fP9585tCVcnNzXdY//vhjJSUlafjw4S7b586dqwkTJpjrwcHB5t+dTqcSEhLUv39/LV68WAcOHND48eMVFhamiRMnWnxFsEJ4eLiaNm2qN954Q40aNart5gAAUCm1GqoGDhyogQMHXrP86gdJfvDBB+rbt6/5EMhSwcHB5T50Uvrl6duFhYVasmSJ7Ha7OnXqpMzMTKWlpRGqPBR3pAEA3shrJqqfPHlS69evV1JSUpmy+fPnq0mTJurRo4cWLFjg8gW/GRkZ6tOnj+x2u7ktMTFR2dnZOnv2bLnnKigokNPpdFkAAAAq4jWPVFi+fLmCg4PL3CZ8+OGHdcstt6hx48basWOHpk+frtzcXKWlpUmS8vLyzE+FlYqIiDDLyru9NG/ePM2ZM6eargQAANRFXhOqlixZolGjRpX5FNaVnwzr2rWr7Ha7/vznP2vevHluPydp+vTpLsd1Op2Kjo6ucB9uWdU99CkA4EZ4Raj69NNPlZ2drVWrVl23blxcnC5fvqxjx46pffv2cjgcOnnypEud0vVrzcMKCAiodCAr/QqWixcvmk/9Rt1QWFgo6f++/gYAgIp4Rah666231LNnT3Xr1u26dTMzM1WvXj2Fh4dLkuLj4/Xkk0+qqKjIDEDp6elq3769JZ8s8/PzU1hYmPkFwUFBQbLZbFU+LmpXSUmJTp8+raCgIPn7e8WPCQCgltXqu0V+fr4OHz5srpd+zUnjxo3VokULSb/celu9enWZL8OVfpmEvmvXLvXt21fBwcHKyMjQ5MmTdf/995uBaeTIkZozZ46SkpI0bdo0HTx4UAsXLtSLL75o2XWUjniVBivUDfXq1VOLFi0IyQCASqnVULVnzx717dvXXC+dxzRmzBgtW7ZMkrRy5UoZhqERI0aU2T8gIEArV67U7NmzVVBQoNjYWE2ePNllPlRoaKg2btyo5ORk9ezZU02bNtWsWbMsfZyCzWZTZGSkwsPDVVRUZNlxUbvsdrvq1fOaD8gCAGoZX1NTCTX9mHsAALwVX1MDAACAKiFUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWIBQBQAAYAF
CFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWKBWQ9X27ds1ZMgQRUVFyWazae3atS7lY8eOlc1mc1kGDBjgUufMmTMaNWqUQkJCFBYWpqSkJOXn57vUycrKUu/evRUYGKjo6GilpqZW96UBAAAfU6uh6sKFC+rWrZsWLVp0zToDBgxQbm6uubz33nsu5aNGjdKhQ4eUnp6udevWafv27Zo4caJZ7nQ6lZCQoJiYGO3du1cLFizQ7Nmz9cYbb1TbdQEAAN/jX5snHzhwoAYOHFhhnYCAADkcjnLLvv76a23YsEFffPGFbr31VknSK6+8okGDBun5559XVFSUVqxYocLCQi1ZskR2u12dOnVSZmam0tLSXMIXAABAVXj8nKpPPvlE4eHhat++vR566CH9+OOPZllGRobCwsLMQCVJ/fv3V7169bRr1y6zTp8+fWS32806iYmJys7O1tmzZ8s9Z0FBgZxOp8sCAABQEY8OVQMGDNDbb7+tzZs367nnntO2bds0cOBAFRcXS5Ly8vIUHh7uso+/v78aN26svLw8s05ERIRLndL10jpXmzdvnkJDQ80lOjra6ksDAAB1TK3e/rue++67z/x7ly5d1LVrV7Vu3VqffPKJ+vXrV23nnT59ulJSUsx1p9NJsAIAABXy6JGqq7Vq1UpNmzbV4cOHJUkOh0OnTp1yqXP58mWdOXPGnIflcDh08uRJlzql69eaqxUQEKCQkBCXBQAAoCJeFap++OEH/fjjj4qMjJQkxcfH69y5c9q7d69ZZ8uWLSopKVFcXJxZZ/v27SoqKjLrpKenq3379mrUqFHNXgAAAKizajVU5efnKzMzU5mZmZKko0ePKjMzUzk5OcrPz9fUqVO1c+dOHTt2TJs3b9bQoUPVpk0bJSYmSpJuvvlmDRgwQBMmTNDu3bv1+eefa9KkSbrvvvsUFRUlSRo5cqTsdruSkpJ06NAhrVq1SgsXLnS5vQcAAFBVtRqq9uzZox49eqhHjx6SpJSUFPXo0UOzZs2Sn5+fsrKydNddd6ldu3ZKSkpSz5499emnnyogIMA8xooVK9ShQwf169dPgwYNUq9evVyeQRUaGqqNGzfq6NGj6tmzp6ZMmaJZs2bxOAUAAGApm2EYRm03wtM5nU6Fhobq/PnzzK8CAKACLZ9Yr2PzB9d2MyTV/Pu3V82pAgAA8FSEKgAAAAsQqgAAACxAqAIAALAAoQoAAMAChCoAAAALEKoAAAAsQKgCAAAuWj6xvrab4JUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWqNVQtX37dg0ZMkRRUVGy2Wxau3atWVZUVKRp06apS5cuatiwoaKiojR69GidOHHC5RgtW7aUzWZzWebPn+9SJysrS71791ZgYKCio6OVmppaE5cHAAB8SK2GqgsXLqhbt25atGhRmbKLFy9q3759mjlzpvbt26c1a9YoOztbd911V5m6c+fOVW5urrn89a9/NcucTqcSEhIUExOjvXv3asGCBZo9e7beeOONar02AADgW/xr8+QDBw7UwIEDyy0LDQ1Venq6y7ZXX31Vv/71r5WTk6MWLVqY24ODg+VwOMo9zooVK1RYWKglS5bIbrerU6dOyszMVFpamiZOnGjdxQAAAJ/mVXOqzp8/L5vNprCwMJft8+fPV5MmTdSjRw8tWLBAly9fNssyMjLUp08f2e12c1tiYqKys7N19uzZmmo6AACo42p1pOpGXLp
0SdOmTdOIESMUEhJibn/44Yd1yy23qHHjxtqxY4emT5+u3NxcpaWlSZLy8vIUGxvrcqyIiAizrFGjRmXOVVBQoIKCAnPd6XRWxyUBAIA6xCtCVVFRkf74xz/KMAy9/vrrLmUpKSnm37t27Sq73a4///nPmjdvngICAtw637x58zRnzpwqtRkAAPgWj7/9Vxqojh8/rvT0dJdRqvLExcXp8uXLOnbsmCTJ4XDo5MmTLnVK1681D2v69Ok6f/68uXz//fdVvxAAAFCneXSoKg1U3377rTZt2qQmTZpcd5/MzEzVq1dP4eHhkqT4+Hht375dRUVFZp309HS1b9++3Ft/khQQEKCQkBCXBQAAoCK1evsvPz9fhw8fNtePHj2qzMxMNW7cWJGRkfrDH/6gffv2ad26dSouLlZeXp4kqXHjxrLb7crIyNCuXbvUt29fBQcHKyMjQ5MnT9b9999vBqaRI0dqzpw5SkpK0rRp03Tw4EEtXLhQL774Yq1cMwAAqJtqNVTt2bNHffv2NddL50eNGTNGs2fP1ocffihJ6t69u8t+W7du1Z133qmAgACtXLlSs2fPVkFBgWJjYzV58mSXeVahoaHauHGjkpOT1bNnTzVt2lSzZs3icQoAAMBStRqq7rzzThmGcc3yisok6ZZbbtHOnTuve56uXbvq008/veH2AQAAVJZHz6kCAADwFoQqAAAACxCqAAAALECoAgAAsAChCgAAwAKEKgAAAAsQqgAAACxAqAIAALAAoQoAAMAChCoAAAALEKoAAAAsQKgCAACwAKEKAADAAoQqAAAACxCqAAAALECoAgAAsAChCgAAwAKEKgAAAAsQqgAAACxAqAIAALCAW6Hqu+++s7odAAAAXs2tUNWmTRv17dtX77zzji5dumR1mwAAALyOW6Fq37596tq1q1JSUuRwOPTnP/9Zu3fvtrptAAAAXsOtUNW9e3ctXLhQJ06c0JIlS5Sbm6tevXqpc+fOSktL0+nTp61uJwAAgEer0kR1f39/DRs2TKtXr9Zzzz2nw4cP67HHHlN0dLRGjx6t3Nxcq9oJAADg0aoUqvbs2aO//OUvioyMVFpamh577DEdOXJE6enpOnHihIYOHWpVOwEAADyavzs7paWlaenSpcrOztagQYP09ttva9CgQapX75eMFhsbq2XLlqlly5ZWthUAAMBjuRWqXn/9dY0fP15jx45VZGRkuXXCw8P11ltvValxAAAA3sKtUPXtt99et47dbteYMWPcOTwAAIDXcWtO1dKlS7V69eoy21evXq3ly5dXuVEAAADexq1QNW/ePDVt2rTM9vDwcD377LNVbhQAAIC3cStU5eTkKDY2tsz2mJgY5eTkVLlRAAAA3satUBUeHq6srKwy27/88ks1adKkyo0CAADwNm6FqhEjRujhhx/W1q1bVVxcrOLiYm3ZskWPPPKI7rvvPqvbCAAA4PHc+vTf008/rWPHjqlfv37y9//lECUlJRo9ejRzqgAAgE9yK1TZ7XatWrVKTz/9tL788ks1aNBAXbp0UUxMjNXtAwAA8ApuhapS7dq1U7t27axqCwAAgNdyK1QVFxdr2bJl2rx5s06dOqWSkhKX8i1btljSOAAAAG/h1kT1Rx55RI888oiKi4vVuXNndevWzWWprO3bt2vIkCGKioqSzWbT2rVrXcoNw9CsWbMUGRmpBg0aqH///mWe5n7mzBmNGjVKISEhCgsLU1JSkvLz813qZGVlqXfv3goMDFR0dLRSU1PduWwAAIBrcmukauXKlfrHP/6hQYMGVenkFy5cULdu3TR+/HgNGzasTHlqaqpefvllLV++XLGxsZo5c6YSExP11VdfKTAwUJI0atQo5ebmKj09XUVFRRo3bpwmTpyod999V5LkdDqVkJCg/v37a/HixTpw4IDGjx+vsLAwTZw4sUrtBwAAKOX2RPU2bdpU+eQDBw7UwIEDyy0zDEMvvfSSZsyYoaFDh0qS3n77bUVERGj
t2rW677779PXXX2vDhg364osvdOutt0qSXnnlFQ0aNEjPP/+8oqKitGLFChUWFmrJkiWy2+3q1KmTMjMzlZaWRqgCAACWcev235QpU7Rw4UIZhmF1e0xHjx5VXl6e+vfvb24LDQ1VXFycMjIyJEkZGRkKCwszA5Uk9e/fX/Xq1dOuXbvMOn369JHdbjfrJCYmKjs7W2fPni333AUFBXI6nS4LAABARdwaqfrss8+0detWffzxx+rUqZPq16/vUr5mzZoqNywvL0+SFBER4bI9IiLCLMvLy1N4eLhLub+/vxo3buxS5+qv1Ck9Zl5enho1alTm3PPmzdOcOXOqfA0AAMB3uBWqwsLCdM8991jdFo8xffp0paSkmOtOp1PR0dG12CIAAODp3ApVS5cutbodZTgcDknSyZMnFRkZaW4/efKkunfvbtY5deqUy36XL1/WmTNnzP0dDodOnjzpUqd0vbTO1QICAhQQEGDJdQAAAN/g1pwq6ZfwsmnTJv3tb3/TTz/9JEk6ceJEmccZuCs2NlYOh0ObN282tzmdTu3atUvx8fGSpPj4eJ07d0579+4162zZskUlJSWKi4sz62zfvl1FRUVmnfT0dLVv377cW38AAADucCtUHT9+XF26dNHQoUOVnJys06dPS5Kee+45PfbYY5U+Tn5+vjIzM5WZmSnpl8npmZmZysnJkc1m06OPPqpnnnlGH374oQ4cOKDRo0crKipKd999tyTp5ptv1oABAzRhwgTt3r1bn3/+uSZNmqT77rtPUVFRkqSRI0fKbrcrKSlJhw4d0qpVq7Rw4UKX23sAAABV5dbtv0ceeUS33nqrvvzySzVp0sTcfs8992jChAmVPs6ePXvUt29fc7006IwZM0bLli3T448/rgsXLmjixIk6d+6cevXqpQ0bNpjPqJKkFStWaNKkSerXr5/q1aun4cOH6+WXXzbLQ0NDtXHjRiUnJ6tnz55q2rSpZs2axeMUAACApWyGG89FaNKkiXbs2KH27dsrODhYX375pVq1aqVjx46pY8eOunjxYnW0tdY4nU6Fhobq/PnzCgkJqe3mAABQrVo+sV7H5g+u8X2tVtPv327d/ispKVFxcXGZ7T/88IOCg4Or3CgAAABv41aoSkhI0EsvvWSu22w25efn66mnnqryV9cAAAB4I7fmVL3wwgtKTExUx44ddenSJY0cOVLffvutmjZtqvfee8/qNgIAAC/Q8on1td2EWuVWqGrevLm+/PJLrVy5UllZWcrPz1dSUpJGjRqlBg0aWN1GAAAAj+dWqJJ++TqY+++/38q2AAAAeC23QtXbb79dYfno0aPdagwAAIC3cvs5VVcqKirSxYsXZbfbFRQURKgCAMBHeNIjFGqbW5/+O3v2rMuSn5+v7Oxs9erVi4nqAADAJ7n93X9Xa9u2rebPn19mFAsAAMAXWBaqpF8mr584ccLKQwIAAA/n649SKOXWnKoPP/zQZd0wDOXm5urVV1/V7bffbknDAAAAvIlboeruu+92WbfZbGrWrJl+97vf6YUXXrCiXQAAAF7FrVBVUlJidTsAAAC8mqVzqgAAgG9iXpWbI1UpKSmVrpuWlubOKQAAALyKW6Fq//792r9/v4qKitS+fXtJ0jfffCM/Pz/dcsstZj2bzWZNKwEAADycW6FqyJAhCg4O1vLly9WoUSNJvzwQdNy4cerdu7emTJliaSMBAAA8nVtzql544QXNmzfPDFSS1KhRIz3zzDN8+g8AAPgkt0KV0+nU6dOny2w/ffq0fvrppyo3CgAAwNu4FaruuecejRs3TmvWrNEPP/ygH374Qf/85z+VlJSkYcOGWd1GAADgASr7CT9f/SSgW3OqFi9erMcee0wjR45UUVHRLwfy91dSUpIWLFhgaQMBAAC8gVuhKigoSK+99poWLFigI0eOSJJat26thg0bWto4AAAAb1Glh3/m5uYqNzdXbdu2VcOGDWUYhlXtAgAA8Cpuhaoff/xR/fr
1U7t27TRo0CDl5uZKkpKSknicAgAAHsBX5zXVJrdC1eTJk1W/fn3l5OQoKCjI3H7vvfdqw4YNljUOAADUDkLZjXNrTtXGjRv173//W82bN3fZ3rZtWx0/ftyShgEAAHgTt0aqLly44DJCVerMmTMKCAiocqMAAAC8jVuhqnfv3nr77bfNdZvNppKSEqWmpqpv376WNQ4AANSelk+sNxdcn1u3/1JTU9WvXz/t2bNHhYWFevzxx3Xo0CGdOXNGn3/+udVtBAAAHqTlE+t1bP7g2m6Gx3FrpKpz58765ptv1KtXLw0dOlQXLlzQsGHDtH//frVu3drqNgIAAA/BqNW13fBIVVFRkQYMGKDFixfrySefrI42AQAAeJ0bHqmqX7++srKyqqMtAADAQzFCdX1u3f67//779dZbb1ndFgAAAK/l1kT1y5cva8mSJdq0aZN69uxZ5jv/0tLSLGkcAACAt7ihUPXdd9+pZcuWOnjwoG655RZJ0jfffONSx2azWdc6AAAAL3FDoapt27bKzc3V1q1bJf3ytTQvv/yyIiIiqqVxAAAA3uKG5lQZhuGy/vHHH+vChQuWNggAAMAbuTVRvdTVIQsAAMBX3VCostlsZeZMVfccqpYtW5rnvXJJTk6WJN15551lyh588EGXY+Tk5Gjw4MEKCgpSeHi4pk6dqsuXL1druwEAgG+5oTlVhmFo7Nix5pcmX7p0SQ8++GCZT/+tWbPGsgZ+8cUXKi4uNtcPHjyo3//+9/rv//5vc9uECRM0d+5cc/3KL3suLi7W4MGD5XA4tGPHDuXm5mr06NGqX7++nn32WcvaCQAAfNsNhaoxY8a4rN9///2WNqY8zZo1c1mfP3++WrdurTvuuMPcFhQUJIfDUe7+Gzdu1FdffaVNmzYpIiJC3bt319NPP61p06Zp9uzZstvt1dp+AADgG24oVC1durS62lEphYWFeuedd5SSkuJy23HFihV655135HA4NGTIEM2cOdMcrcrIyFCXLl1cPqGYmJiohx56SIcOHVKPHj3KnKegoEAFBQXmutPprMarAgAAdYFbD/+sLWvXrtW5c+c0duxYc9vIkSMVExOjqKgoZWVladq0acrOzjZvQebl5ZV55EPpel5eXrnnmTdvnubMmVM9FwEAAOokrwpVb731lgYOHKioqChz28SJE82/d+nSRZGRkerXr5+OHDmi1q1bu3We6dOnKyUlxVx3Op2Kjo52v+EAAKDO85pQdfz4cW3atOm6k+Dj4uIkSYcPH1br1q3lcDi0e/dulzonT56UpGvOwwoICDAn4wMAAFRGlZ5TVZOWLl2q8PBwDR48uMJ6mZmZkqTIyEhJUnx8vA4cOKBTp06ZddLT0xUSEqKOHTtWW3sBAIBv8YqRqpKSEi1dulRjxoyRv///NfnIkSN69913NWjQIDVp0kRZWVmaPHmy+vTpo65du0qSEhIS1LFjRz3wwANKTU1VXl6eZsyYoeTkZEajAACAZbwiVG3atEk5OTkaP368y3a73a5NmzbppZde0oULFxQdHa3hw4drxowZZh0/Pz+tW7dODz30kOLj49WwYUONGTPG5blWAAAAVeUVoSohIaHcr8SJjo7Wtm3brrt/TEyMPvroo+poGgAAgCQvmlMFAADgyQhVAAAAFiBUAQAAU8sn1td2E7wWoQoAAMAChCoAAAALEKoAAAAsQKgCAACW88W5WYQqAAAACxCqAAAALECoAgAAN8wXb+9dD6EKAADAAoQqAAAACxCqAAAALECoAgAAbmFelStCFQAAPurKUERAqjpCFQAAXoog5FkIVQAA+DjCmTUIVQAAQBLhqqoIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAIAP4vEJ1iNUAQAAWIBQBQCAD2PEyjqEKgAAAAsQqgAAQLXwtVEwQhUAAIAFCFUAAAAWIFQBAABYwL+2GwAAAGqOr81zqkmMVAEAAFiAUAUAAGA
BQhUAAIAFCFUAAKDa+NIcLkIVAACABTw6VM2ePVs2m81l6dChg1l+6dIlJScnq0mTJrrppps0fPhwnTx50uUYOTk5Gjx4sIKCghQeHq6pU6fq8uXLNX0pAACgjvPoUCVJnTp1Um5urrl89tlnZtnkyZP1r3/9S6tXr9a2bdt04sQJDRs2zCwvLi7W4MGDVVhYqB07dmj58uVatmyZZs2aVRuXAgCAT/KVW4Ae/5wqf39/ORyOMtvPnz+vt956S++++65+97vfSZKWLl2qm2++WTt37tRvfvMbbdy4UV999ZU2bdqkiIgIde/eXU8//bSmTZum2bNny2631/TlAACAOsrjR6q+/fZbRUVFqVWrVho1apRycnIkSXv37lVRUZH69+9v1u3QoYNatGihjIwMSVJGRoa6dOmiiIgIs05iYqKcTqcOHTp0zXMWFBTI6XS6LAAAABXx6FAVFxenZcuWacOGDXr99dd19OhR9e7dWz/99JPy8vJkt9sVFhbmsk9ERITy8vIkSXl5eS6BqrS8tOxa5s2bp9DQUHOJjo629sIAAECd49G3/wYOHGj+vWvXroqLi1NMTIz+8Y9/qEGDBtV23unTpyslJcVcdzqdBCsAAFAhjx6pulpYWJjatWunw4cPy+FwqLCwUOfOnXOpc/LkSXMOlsPhKPNpwNL18uZplQoICFBISIjLAgAAUBGvClX5+fk6cuSIIiMj1bNnT9WvX1+bN282y7Ozs5WTk6P4+HhJUnx8vA4cOKBTp06ZddLT0xUSEqKOHTvWePsBALiar3wyzhd49O2/xx57TEOGDFFMTIxOnDihp556Sn5+fhoxYoRCQ0OVlJSklJQUNW7cWCEhIfrrX/+q+Ph4/eY3v5EkJSQkqGPHjnrggQeUmpqqvLw8zZgxQ8nJyQoICKjlqwMAAHWJR4eqH374QSNGjNCPP/6oZs2aqVevXtq5c6eaNWsmSXrxxRdVr149DR8+XAUFBUpMTNRrr71m7u/n56d169bpoYceUnx8vBo2bKgxY8Zo7ty5tXVJAACgjvLoULVy5coKywMDA7Vo0SItWrTomnViYmL00UcfWd00AAAAF141pwoAAMBTEaoAAPARTIqvXoQqAAAACxCqAAAALECoAgDAB3Drr/oRqgAAACxAqAIAoI7zhFEqT2hDdSNUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAHVIyyfWm48v8IXHGHgSQhUAAKgRdT3kEaoAAAAsQKgCAKAOq+ujQ56EUAUAAGABQhUAAIAFCFUAANQR3OqrXYQqAAAACxCqAABAjanLo2mEKgAAAAsQqgAA8HJ1efTHmxCqAABAjaqrIZBQBQBAHVBXg4o3IVQBAABYgFAFAABgAUIVAABejNt+noNQBQAAYAFCFQAAgAUIVQAAeCFu+3keQhUAAF6g5RPrzSBFoPJMhCoAAAALEKoAAKgiRo4gEaoAAPAqBDjPRagCAACwgEeHqnnz5um2225TcHCwwsPDdffddys7O9ulzp133imbzeayPPjggy51cnJyNHjwYAUFBSk8PFxTp07V5cuXa/JSAAC4IVeOSDE65R08OlRt27ZNycnJ2rlzp9LT01VUVKSEhARduHDBpd6ECROUm5trLqmpqWZZcXGxBg8erMLCQu3YsUPLly/XsmXLNGvWrJq+HAAAXPhyWKqL1+5f2w2oyIYNG1zWly1bpvDwcO3du1d9+vQxtwcFBcnhcJR7jI0bN+qrr77Spk2bFBERoe7du+vpp5/WtGnTNHv2bNnt9mq9BgAAylMXQ4Wv8+iRqqudP39ektS4cWOX7StWrFDTpk3VuXNnTZ8+XRcvXjTLMjIy1KVLF0VERJjbEhMT5XQ6dejQoZppOAAAlXTl86h8jbdft0ePVF2ppKREjz76qG6//XZ17tzZ3D5y5EjFxMQoKipKWVlZmjZtmrKzs7VmzRpJUl5enkugkmSu5+XllXu
ugoICFRQUmOtOp9PqywEAwNTyifU6Nn/wNcvgHbwmVCUnJ+vgwYP67LPPXLZPnDjR/HuXLl0UGRmpfv366ciRI2rdurVb55o3b57mzJlTpfYCAFBVBCrv4hW3/yZNmqR169Zp69atat68eYV14+LiJEmHDx+WJDkcDp08edKlTun6teZhTZ8+XefPnzeX77//vqqXAADwcQSkus+jQ5VhGJo0aZLef/99bdmyRbGxsdfdJzMzU5IUGRkpSYqPj9eBAwd06tQps056erpCQkLUsWPHco8REBCgkJAQlwUAAHdd+Z19hKu6y6Nv/yUnJ+vdd9/VBx98oODgYHMOVGhoqBo0aKAjR47o3Xff1aBBg9SkSRNlZWVp8uTJ6tOnj7p27SpJSkhIUMeOHfXAAw8oNTVVeXl5mjFjhpKTkxUQEFCblwcA8FEEq7rJo0eqXn/9dZ0/f1533nmnIiMjzWXVqlWSJLvdrk2bNikhIUEdOnTQlClTNHz4cP3rX/8yj+Hn56d169bJz89P8fHxuv/++zV69GjNnTu3ti4LAIByEba8m0ePVBmGUWF5dHS0tm3bdt3jxMTE6KOPPrKqWQAAXFdFn+hD3eTRI1UAAHgjRpwqp669ToQqAACqUWWDQ10LGJVVl66bUAUAgKrnzb0uBQZcH6EKAADAAoQqAAAACxCqAACwCLf7fBuhCgAAwAKEKgAALMRole8iVAEAYAHCFAhVAAAAFiBUAfB5jDAAsAKhCgAAwAKEKgAAAAsQqgAAACxAqAIAoAqYk4dShCoAANxAmLJOXXktCVUAgDqjpt+c60oYgDUIVQAAlIPAVLPqwutNqAIA+KxrvZFfub0uvNmjZhCqAAA+qTJhqbxw1fKJ9QQtlItQBQDwORWFIgIT3EWoAgB4laqGHncC1ZWjVMC1EKoAAD6BQITqRqgCAPg8Apfn8OY5a4QqAIBP89Y38LrOG/uFUAUAAGABQhW8jjf+7wWAq+r4OeZ3A2oboQqAR+ONsnp5++tbXvuv9Wypa9WHZ/OmPiNUAaizvOmX8fXUpWupKQQp1DRCFQCfUlufLKrKOT01FHhquyTPbhsqz9v6kVAFS3nbDwBQVTfyb74u/nzUZlhkJAqehlAFwOvU5puor57bXTcafK43RwrwZIQqAECtcic0Xb0PwQuegFAFlINf0N6nJvqsJv9d1ORttfI+LVfRscqbl+buftdr05V/evOTtlF13tD/hCoAddL1fvl62i9nd9tzrbBx5baK/l7ZNlWmfVe341rncTeowXd5y78FQhWASqmJkZPq/sVp1fGt+h/z9YJEReHjWm2o7PEq2tedNgE15UbDfk0iVMFj1NaniDzth7Ku8ZbX90ZGcMrbt6LjubO/O6r756A6roeQBnd46r8RQhVqRV2b/3K983rqL4DqYOX/ImszbFw9InSjo0JXH+fqOpUdKfKlfzvAjfDEnw2fClWLFi1Sy5YtFRgYqLi4OO3evbu2m4Qa5ukjWp4SQjxBVUc7KlPHynk8NzLK5Q395A1tBCTP+hCDz4SqVatWKSUlRU899ZT27dunbt26KTExUadOnXL7mLU1IuHuLYqqnMsXVfcPaUXHvlbZjW6/sry6AqW7o0BX/r2i9av3uV4IutbIUE3y5Z8bwJf5TKhKS0vThAkTNG7cOHXs2FGLFy9WUFCQlixZUqXjVvV/tdV9nqv3vd6bo5XXcyPbrle3ojBh1YRhK/cr739OlQ0FVeXuv63KhPUr67hzHRUFnsqe91rHqmwbANRdtf07wGYYhlGrLagBhYWFCgoK0v/8z//o7rvvNrePGTNG586d0wcffOBSv6CgQAUFBeb6+fPn1aJFC33//fcKCQlR56f+7VL/4JxEdX7q3+X+KcmsX7p+5bYrj1Fe2ZXHuPp4V567PNcqu3p7efUqs6289txIOyrjRvetTP2qtMdq7vTfjR77yuNYce2e9PoBQHlK35+cTqeio6N17tw5hYaGVv+JDR/
wn//8x5Bk7Nixw2X71KlTjV//+tdl6j/11FOGJBYWFhYWFpY6sBw5cqRG8oa/UMb06dOVkpJirp87d04xMTHKycmpmaSLCpX+z6N05BC1h77wHPSF56AvPEfpnabGjRvXyPl8IlQ1bdpUfn5+OnnypMv2kydPyuFwlKkfEBCggICAMttDQ0P5AfEgISEh9IeHoC88B33hOegLz1GvXs1MIfeJiep2u109e/bU5s2bzW0lJSXavHmz4uPja7FlAACgrvCJkSpJSklJ0ZgxY3Trrbfq17/+tV566SVduHBB48aNq+2mAQCAOsBnQtW9996r06dPa9asWcrLy1P37t21YcMGRUREXHffgIAAPfXUU+XeEkTNoz88B33hOegLz0FfeI6a7gufeKQCAABAdfOJOVUAAADVjVAFAABgAUIVAACABQhVAAAAFvCZULV9+3YNGTJEUVFRstlsWrt2rUu5YRiaNWuWIiMj1aBBA/Xv31/ffvutS50zZ85o1KhRCgkJUVhYmJKSkpSfn+9SJysrS71791ZgYKCio6OVmppa3ZfmlSrqj6KiIk2bNk1dunRRw4YNFRUVpdGjR+vEiRMux6A/rHG9n40rPfjgg7LZbHrppZdcttMX1qhMX3z99de66667FBoaqoYNG+q2225TTk6OWX7p0iUlJyerSZMmuummmzR8+PAyDz7OycnR4MGDFRQUpPDwcE2dOlWXL1+u7svzKtfri/z8fE2aNEnNmzdXgwYN1LFjRy1evNilDn1hjXnz5um2225TcHCwwsPDdffddys7O9uljlWv9SeffKJbbrlFAQEBatOmjZYtW3ZDbfWZUHXhwgV169ZNixYtKrc8NTVVL7/8shYvXqxdu3apYcOGSkxM1KVLl8w6o0aN0qFDh5Senq5169Zp+/btmjhxolnudDqVkJCgmJgY7d27VwsWLNDs2bP1xhtvVPv1eZuK+uPixYvat2+fZs6cqX379mnNmjXKzs7WXXfd5VKP/rDG9X42Sr3//vvauXOnoqKiypTRF9a4Xl8cOXJEvXr1UocOHfTJJ58oKytLM2fOVGBgoFln8uTJ+te//qXVq1dr27ZtOnHihIYNG2aWFxcXa/DgwSosLNSOHTu0fPlyLVu2TLNmzar26/Mm1+uLlJQUbdiwQe+8846+/vprPfroo5o0aZI+/PBDsw59YY1t27YpOTlZO3fuVHp6uoqKipSQkKALFy6Ydax4rY8eParBgwerb9++yszM1KOPPqo//elP+ve/b+AL5GvkGwY9jCTj/fffN9dLSkoMh8NhLFiwwNx27tw5IyAgwHjvvfcMwzCMr776ypBkfPHFF2adjz/+2LDZbMZ//vMfwzAM47XXXjMaNWpkFBQUmHWmTZtmtG/fvpqvyLtd3R/l2b17tyHJOH78uGEY9Ed1uVZf/PDDD8avfvUr4+DBg0ZMTIzx4osvmmX0RfUory/uvfde4/7777/mPufOnTPq169vrF692tz29ddfG5KMjIwMwzAM46OPPjLq1atn5OXlmXVef/11IyQkxKV/8H/K64tOnToZc+fOddl2yy23GE8++aRhGPRFdTp16pQhydi2bZthGNa91o8//rjRqVMnl3Pde++9RmJiYqXb5jMjVRU5evSo8vLy1L9/f3NbaGio4uLilJGRIUnKyMhQWFiYbr31VrNO//79Va9ePe3atcus06dPH9ntdrNOYmKisrOzdfbs2Rq6mrrp/PnzstlsCgsLk0R/1KSSkhI98MADmjp1qjp16lSmnL6oGSUlJVq/fr3atWunxMREhYeHKy4uzuW21N69e1VUVOTyu6xDhw5q0aKFy++yLl26uDz4ODExUU6nU4cOHaqx6/F2v/3tb/Xhhx/qP//5jwzD0NatW/XNN98oISFBEn1Rnc6fPy9J5pckW/VaZ2RkuByjtE7pMSqDUCUpLy9Pkso8XT0iIsIsy8vLU3h4uEu5v7+/Gjdu7FKnvGNceQ7cuEuXLmn
atGkaMWKE+eWk9EfNee655+Tv76+HH3643HL6omacOnVK+fn5mj9/vgYMGKCNGzfqnnvu0bBhw7Rt2zZJv7yWdrvd/M9Hqat/l9EXVffKK6+oY8eOat68uex2uwYMGKBFixapT58+kuiL6lJSUqJHH31Ut99+uzp37izJutf6WnWcTqd+/vnnSrXPZ76mBt6pqKhIf/zjH2UYhl5//fXabo7P2bt3rxYuXKh9+/bJZrPVdnN8WklJiSRp6NChmjx5siSpe/fu2rFjhxYvXqw77rijNpvnc1555RXt3LlTH374oWJiYrR9+3YlJycrKiqqzGgHrJOcnKyDBw/qs88+q+2mlIuRKkkOh0OSynxS4OTJk2aZw+HQqVOnXMovX76sM2fOuNQp7xhXngOVVxqojh8/rvT0dHOUSqI/asqnn36qU6dOqUWLFvL395e/v7+OHz+uKVOmqGXLlpLoi5rStGlT+fv7q2PHji7bb775ZvPTfw6HQ4WFhTp37pxLnat/l9EXVfPzzz/r//2//6e0tDQNGTJEXbt21aRJk3Tvvffq+eefl0RfVIdJkyZp3bp12rp1q5o3b25ut+q1vladkJAQNWjQoFJtJFRJio2NlcPh0ObNm81tTqdTu3btUnx8vCQpPj5e586d0969e806W7ZsUUlJieLi4sw627dvV1FRkVknPT1d7du3V6NGjWroauqG0kD17bffatOmTWrSpIlLOf1RMx544AFlZWUpMzPTXKKiojR16lTzEzH0Rc2w2+267bbbynyU/JtvvlFMTIwkqWfPnqpfv77L77Ls7Gzl5OS4/C47cOCASxAu/U/L1YEN5SsqKlJRUZHq1XN9C/Xz8zNHFOkL6xiGoUmTJun999/Xli1bFBsb61Ju1WsdHx/vcozSOqXHqGxjfcJPP/1k7N+/39i/f78hyUhLSzP2799vfpps/vz5RlhYmPHBBx8YWVlZxtChQ43Y2Fjj559/No8xYMAAo0ePHsauXbuMzz77zGjbtq0xYsQIs/zcuXNGRESE8cADDxgHDx40Vq5caQQFBRl/+9vfavx6PV1F/VFYWGjcddddRvPmzY3MzEwjNzfXXK78RAz9YY3r/Wxc7epP/xkGfWGV6/XFmjVrjPr16xtvvPGG8e233xqvvPKK4efnZ3z66afmMR588EGjRYsWxpYtW4w9e/YY8fHxRnx8vFl++fJlo3PnzkZCQoKRmZlpbNiwwWjWrJkxffr0Gr9eT3a9vrjjjjuMTp06GVu3bjW+++47Y+nSpUZgYKDx2muvmcegL6zx0EMPGaGhocYnn3zi8n5w8eJFs44Vr/V3331nBAUFGVOnTjW+/vprY9GiRYafn5+xYcOGSrfVZ0LV1q1bDUllljFjxhiG8ctjFWbOnGlEREQYAQEBRr9+/Yzs7GyXY/z444/GiBEjjJtuuskICQkxxo0bZ/z0008udb788kujV69eRkBAgPGrX/3KmD9/fk1dolepqD+OHj1abpkkY+vWreYx6A9rXO9n42rlhSr6whqV6Yu33nrLaNOmjREYGGh069bNWLt2rcsxfv75Z+Mvf/mL0ahRIyMoKMi45557jNzcXJc6x44dMwYOHGg0aNDAaNq0qTFlyhSjqKioJi7Ra1yvL3Jzc42xY8caUVFRRmBgoNG+fXvjhRdeMEpKSsxj0BfWuNb7wdKlS806Vr3WW7duNbp3727Y7XajVatWLueoDNv/32AAAABUAXOqAAAALECoAgAAsAChCgAAwAKEKgAAAAsQqgAAACxAqAIAALAAoQoAAMAChCoAAAALEKoAAAAsQKgCAACwAKEKAADAAoQqAAAAC/x/L/USoZWJ/b0AAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.plot.hist(column=[\"Birth year\"], xlim=(1000, 2000), bins=4000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Помимо этого обработаем колонку страны таким образом, что каждый человек, который жил не в одной стране, будет занимать более одной строки, в соответствии с количеством стран в которых он жил." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 116555 entries, 0 to 99999\n", + "Data columns (total 10 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Id 116555 non-null object \n", + " 1 Name 116555 non-null object \n", + " 2 Short description 116555 non-null object \n", + " 3 Gender 116555 non-null object \n", + " 4 Country 116555 non-null object \n", + " 5 Occupation 116555 non-null object \n", + " 6 Birth year 116555 non-null int64 \n", + " 7 Death year 116555 non-null float64\n", + " 8 Manner of death 116555 non-null object \n", + " 9 Age of death 116555 non-null float64\n", + "dtypes: float64(2), int64(1), object(7)\n", + "memory usage: 9.8+ MB\n" + ] + } + ], + "source": [ + "df['Country'] = df['Country'].str.split('; ')\n", + "df = df.explode('Country')\n", + "df.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Далее выполним разбиение на обучающую, контрольную и тестовую выборки." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "121\n", + "Обучающая выборка: (67038, 10)\n", + "Country\n", + "Germany 15128\n", + "United States of America 8946\n", + "France 4715\n", + "NaN 3248\n", + "United Kingdom 2796\n", + " ... 
\n", + "Song dynasty 32\n", + "Paraguay 31\n", + "Kingdom of Sardinia 31\n", + "Confederation of the Rhine 30\n", + "Kingdom of Saxony 30\n", + "Name: count, Length: 121, dtype: int64\n", + "Контрольная выборка: (22346, 10)\n", + "Country\n", + "Germany 5043\n", + "United States of America 2982\n", + "France 1572\n", + "NaN 1082\n", + "United Kingdom 932\n", + " ... \n", + "Vietnam 11\n", + "Paraguay 10\n", + "Kingdom of Saxony 10\n", + "Confederation of the Rhine 10\n", + "Kingdom of Sardinia 10\n", + "Name: count, Length: 121, dtype: int64\n", + "Тестовая выборка: (22347, 10)\n", + "Country\n", + "Germany 5043\n", + "United States of America 2982\n", + "France 1572\n", + "NaN 1083\n", + "United Kingdom 933\n", + " ... \n", + "England 11\n", + "Confederation of the Rhine 10\n", + "Paraguay 10\n", + "Kingdom of Sardinia 10\n", + "Kingdom of Saxony 10\n", + "Name: count, Length: 121, dtype: int64\n" + ] + } + ], + "source": [ + "data = df.copy()\n", + "\n", + "value_counts = data[\"Country\"].value_counts()\n", + "rare = value_counts[value_counts < 50].index\n", + "data = data[~data[\"Country\"].isin(rare)]\n", + "\n", + "print(len(data[\"Country\"].unique()))\n", + "\n", + " \n", + "df_train, df_val, df_test = split_stratified_into_train_val_test(\n", + " data, stratify_colname=\"Country\", frac_train=0.60, frac_val=0.20, frac_test=0.20)\n", + "\n", + "print(\"Обучающая выборка: \", df_train.shape)\n", + "print(df_train[\"Country\"].value_counts())\n", + "\n", + "print(\"Контрольная выборка: \", df_val.shape)\n", + "print(df_val[\"Country\"].value_counts())\n", + "\n", + "print(\"Тестовая выборка: \", df_test.shape)\n", + "print(df_test[\"Country\"].value_counts())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В данных были удалены строки, у которых были \"редкие\" страны. 
Данные наращивать (применять oversampling) не будем, поскольку в этом нет необходимости.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Выполним конструирование признаков. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Начнем с унитарного кодирования категориальных признаков. Под этот пункт подходит столбец «Country» (страна)." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Country_AlbaniaCountry_ArgentinaCountry_AustraliaCountry_AustriaCountry_Austria-HungaryCountry_Austrian EmpireCountry_BelgiumCountry_BoliviaCountry_BrazilCountry_British Raj...Country_United Kingdom of Great Britain and IrelandCountry_United States of AmericaCountry_UruguayCountry_VenezuelaCountry_VietnamCountry_WalesCountry_Weimar RepublicCountry_West GermanyCountry_YugoslaviaCountry_ancient Rome
00.00.00.00.00.00.00.00.00.00.0...0.01.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
30.00.00.00.00.00.00.00.00.00.0...0.01.00.00.00.00.00.00.00.00.0
40.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
..................................................................
1117260.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
1117270.00.00.00.00.00.00.00.00.00.0...0.01.00.00.00.00.00.00.00.00.0
1117280.00.00.00.00.00.00.00.00.00.0...0.01.00.00.00.00.00.00.00.00.0
1117290.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
1117300.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n", + "

111731 rows × 120 columns

\n", + "
" + ], + "text/plain": [ + " Country_Albania Country_Argentina Country_Australia \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "111726 0.0 0.0 0.0 \n", + "111727 0.0 0.0 0.0 \n", + "111728 0.0 0.0 0.0 \n", + "111729 0.0 0.0 0.0 \n", + "111730 0.0 0.0 0.0 \n", + "\n", + " Country_Austria Country_Austria-Hungary Country_Austrian Empire \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "111726 0.0 0.0 0.0 \n", + "111727 0.0 0.0 0.0 \n", + "111728 0.0 0.0 0.0 \n", + "111729 0.0 0.0 0.0 \n", + "111730 0.0 0.0 0.0 \n", + "\n", + " Country_Belgium Country_Bolivia Country_Brazil Country_British Raj \\\n", + "0 0.0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 0.0 \n", + "... ... ... ... ... \n", + "111726 0.0 0.0 0.0 0.0 \n", + "111727 0.0 0.0 0.0 0.0 \n", + "111728 0.0 0.0 0.0 0.0 \n", + "111729 0.0 0.0 0.0 0.0 \n", + "111730 0.0 0.0 0.0 0.0 \n", + "\n", + " ... Country_United Kingdom of Great Britain and Ireland \\\n", + "0 ... 0.0 \n", + "1 ... 0.0 \n", + "2 ... 0.0 \n", + "3 ... 0.0 \n", + "4 ... 0.0 \n", + "... ... ... \n", + "111726 ... 0.0 \n", + "111727 ... 0.0 \n", + "111728 ... 0.0 \n", + "111729 ... 0.0 \n", + "111730 ... 0.0 \n", + "\n", + " Country_United States of America Country_Uruguay Country_Venezuela \\\n", + "0 1.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 \n", + "3 1.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "111726 0.0 0.0 0.0 \n", + "111727 1.0 0.0 0.0 \n", + "111728 1.0 0.0 0.0 \n", + "111729 0.0 0.0 0.0 \n", + "111730 0.0 0.0 0.0 \n", + "\n", + " Country_Vietnam Country_Wales Country_Weimar Republic \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 \n", + "... ... ... ... 
\n", + "111726 0.0 0.0 0.0 \n", + "111727 0.0 0.0 0.0 \n", + "111728 0.0 0.0 0.0 \n", + "111729 0.0 0.0 0.0 \n", + "111730 0.0 0.0 0.0 \n", + "\n", + " Country_West Germany Country_Yugoslavia Country_ancient Rome \n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "111726 0.0 0.0 0.0 \n", + "111727 0.0 0.0 0.0 \n", + "111728 0.0 0.0 0.0 \n", + "111729 0.0 0.0 0.0 \n", + "111730 0.0 0.0 0.0 \n", + "\n", + "[111731 rows x 120 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n", + "\n", + "encoded_values = encoder.fit_transform(data[[\"Country\"]])\n", + "\n", + "encoded_columns = encoder.get_feature_names_out([\"Country\"])\n", + "\n", + "encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n", + "\n", + "encoded_values_df\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Далее выполним дискретизацию числовых признаков" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Age of deathAge of death
067.0middle-aged
067.0middle-aged
149.0middle-aged
256.0middle-aged
457.0middle-aged
457.0middle-aged
542.0middle-aged
688.0old
786.0old
861.0middle-aged
973.0middle-aged
973.0middle-aged
1042.0middle-aged
1298.0old
1356.0middle-aged
1456.0middle-aged
1456.0middle-aged
1456.0middle-aged
1663.0middle-aged
1791.0old
\n", + "
" + ], + "text/plain": [ + " Age of death Age of death\n", + "0 67.0 middle-aged\n", + "0 67.0 middle-aged\n", + "1 49.0 middle-aged\n", + "2 56.0 middle-aged\n", + "4 57.0 middle-aged\n", + "4 57.0 middle-aged\n", + "5 42.0 middle-aged\n", + "6 88.0 old\n", + "7 86.0 old\n", + "8 61.0 middle-aged\n", + "9 73.0 middle-aged\n", + "9 73.0 middle-aged\n", + "10 42.0 middle-aged\n", + "12 98.0 old\n", + "13 56.0 middle-aged\n", + "14 56.0 middle-aged\n", + "14 56.0 middle-aged\n", + "14 56.0 middle-aged\n", + "16 63.0 middle-aged\n", + "17 91.0 old" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labels = [\"young\", \"middle-aged\", \"old\"]\n", + "num_bins = 3\n", + "hist1, bins1 = np.histogram(data[\"Age of death\"].fillna(data[\"Age of death\"].median()), bins=num_bins)\n", + "pd.concat([data[\"Age of death\"], pd.cut(data[\"Age of death\"], list(bins1), labels=labels)], axis=1).head(20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Выполнить «ручной» синтез признаков в рамках данного набора данных не является возможным." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Масштабирование признаков на основе нормировки и стандартизации в рамках данного набора данных не является необходимым." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Выполним конструирование признаков с применением фреймворка Featuretools. " + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Built 7 features\n", + "Elapsed: 00:00 | Progress: 100%|██████████\n", + " Gender Country Occupation Birth year Death year \\\n", + "Id \n", + "Q23 Male United States of America Politician 1732 1799.0 \n", + "Q42 Male United Kingdom Artist 1952 2001.0 \n", + "Q91 Male United States of America Politician 1809 1865.0 \n", + "Q255 Male Holy Roman Empire Artist 1770 1827.0 \n", + "Q260 Male Kingdom of France Egyptologist 1790 1832.0 \n", + "\n", + " Manner of death Age of death \n", + "Id \n", + "Q23 natural causes 67.0 \n", + "Q42 natural causes 49.0 \n", + "Q91 homicide 56.0 \n", + "Q255 NaN 57.0 \n", + "Q260 natural causes 42.0 \n" + ] + } + ], + "source": [ + "data1 = data.drop_duplicates(subset=\"Id\", keep=\"first\")\n", + "\n", + "df_train = pd.DataFrame(data1)\n", + "\n", + "# Создание EntitySet\n", + "es = ft.EntitySet(id='death_data')\n", + "\n", + "# Добавление DataFrame в EntitySet\n", + "es = es.add_dataframe(\n", + " dataframe_name='deaths',\n", + " dataframe=df_train,\n", + " index='Id',\n", + " make_index=False\n", + ")\n", + "\n", + "# Определение примитивов (операций) для конструирования признаков\n", + "feature_matrix, feature_defs = ft.dfs(\n", + " entityset=es,\n", + " target_dataframe_name='deaths',\n", + " max_depth=2,\n", + " verbose=1,\n", + " n_jobs=1\n", + ")\n", + "\n", + "# Вывод сгенерированных признаков\n", + "print(feature_matrix.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Все наборы признаков имеют плохую предсказательную способность, высокую 
скорость вычисления, низкую надежность, низкую корреляцию и низкую цельность. Они не являются информативными, как и сам набор данных." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "aimvenv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}