4279 lines
160 KiB
Plaintext
4279 lines
160 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Загрузка набора данных Titanic"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Survived</th>\n",
|
||
" <th>Pclass</th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Sex</th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>SibSp</th>\n",
|
||
" <th>Parch</th>\n",
|
||
" <th>Ticket</th>\n",
|
||
" <th>Fare</th>\n",
|
||
" <th>Cabin</th>\n",
|
||
" <th>Embarked</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>PassengerId</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Braund, Mr. Owen Harris</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>A/5 21171</td>\n",
|
||
" <td>7.2500</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>PC 17599</td>\n",
|
||
" <td>71.2833</td>\n",
|
||
" <td>C85</td>\n",
|
||
" <td>C</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Heikkinen, Miss. Laina</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>STON/O2. 3101282</td>\n",
|
||
" <td>7.9250</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>113803</td>\n",
|
||
" <td>53.1000</td>\n",
|
||
" <td>C123</td>\n",
|
||
" <td>S</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Allen, Mr. William Henry</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>373450</td>\n",
|
||
" <td>8.0500</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>887</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>Montvila, Rev. Juozas</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>211536</td>\n",
|
||
" <td>13.0000</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>888</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Graham, Miss. Margaret Edith</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>19.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>112053</td>\n",
|
||
" <td>30.0000</td>\n",
|
||
" <td>B42</td>\n",
|
||
" <td>S</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>889</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>W./C. 6607</td>\n",
|
||
" <td>23.4500</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>890</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Behr, Mr. Karl Howell</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>111369</td>\n",
|
||
" <td>30.0000</td>\n",
|
||
" <td>C148</td>\n",
|
||
" <td>C</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>891</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Dooley, Mr. Patrick</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>32.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>370376</td>\n",
|
||
" <td>7.7500</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Q</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>891 rows × 11 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Survived Pclass \\\n",
|
||
"PassengerId \n",
|
||
"1 0 3 \n",
|
||
"2 1 1 \n",
|
||
"3 1 3 \n",
|
||
"4 1 1 \n",
|
||
"5 0 3 \n",
|
||
"... ... ... \n",
|
||
"887 0 2 \n",
|
||
"888 1 1 \n",
|
||
"889 0 3 \n",
|
||
"890 1 1 \n",
|
||
"891 0 3 \n",
|
||
"\n",
|
||
" Name Sex Age \\\n",
|
||
"PassengerId \n",
|
||
"1 Braund, Mr. Owen Harris male 22.0 \n",
|
||
"2 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 \n",
|
||
"3 Heikkinen, Miss. Laina female 26.0 \n",
|
||
"4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 \n",
|
||
"5 Allen, Mr. William Henry male 35.0 \n",
|
||
"... ... ... ... \n",
|
||
"887 Montvila, Rev. Juozas male 27.0 \n",
|
||
"888 Graham, Miss. Margaret Edith female 19.0 \n",
|
||
"889 Johnston, Miss. Catherine Helen \"Carrie\" female NaN \n",
|
||
"890 Behr, Mr. Karl Howell male 26.0 \n",
|
||
"891 Dooley, Mr. Patrick male 32.0 \n",
|
||
"\n",
|
||
" SibSp Parch Ticket Fare Cabin Embarked \n",
|
||
"PassengerId \n",
|
||
"1 1 0 A/5 21171 7.2500 NaN S \n",
|
||
"2 1 0 PC 17599 71.2833 C85 C \n",
|
||
"3 0 0 STON/O2. 3101282 7.9250 NaN S \n",
|
||
"4 1 0 113803 53.1000 C123 S \n",
|
||
"5 0 0 373450 8.0500 NaN S \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"887 0 0 211536 13.0000 NaN S \n",
|
||
"888 0 0 112053 30.0000 B42 S \n",
|
||
"889 1 2 W./C. 6607 23.4500 NaN S \n",
|
||
"890 0 0 111369 30.0000 C148 C \n",
|
||
"891 0 0 370376 7.7500 NaN Q \n",
|
||
"\n",
|
||
"[891 rows x 11 columns]"
|
||
]
|
||
},
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"titanic = pd.read_csv(\"data/titanic.csv\", index_col=\"PassengerId\")\n",
|
||
"\n",
|
||
"titanic"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Унитарное кодирование\n",
|
||
"\n",
|
||
"Преобразование категориального признака в несколько бинарных признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Унитарное кодирование признаков Пол (Sex) и Порт посадки (Embarked)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Кодирование"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Embarked_Q</th>\n",
|
||
" <th>Embarked_S</th>\n",
|
||
" <th>Embarked_nan</th>\n",
|
||
" <th>Sex_male</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>886</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>887</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>888</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>889</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>890</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>891 rows × 4 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Embarked_Q Embarked_S Embarked_nan Sex_male\n",
|
||
"0 0.0 1.0 0.0 1.0\n",
|
||
"1 0.0 0.0 0.0 0.0\n",
|
||
"2 0.0 1.0 0.0 0.0\n",
|
||
"3 0.0 1.0 0.0 0.0\n",
|
||
"4 0.0 1.0 0.0 1.0\n",
|
||
".. ... ... ... ...\n",
|
||
"886 0.0 1.0 0.0 1.0\n",
|
||
"887 0.0 1.0 0.0 0.0\n",
|
||
"888 0.0 1.0 0.0 0.0\n",
|
||
"889 0.0 0.0 0.0 1.0\n",
|
||
"890 1.0 0.0 0.0 1.0\n",
|
||
"\n",
|
||
"[891 rows x 4 columns]"
|
||
]
|
||
},
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"import numpy as np\n",
|
||
"\n",
|
||
"encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
|
||
"\n",
|
||
"encoded_values = encoder.fit_transform(titanic[[\"Embarked\", \"Sex\"]])\n",
|
||
"\n",
|
||
"encoded_columns = encoder.get_feature_names_out([\"Embarked\", \"Sex\"])\n",
|
||
"\n",
|
||
"# Reuse titanic's PassengerId index: the default 0..n-1 RangeIndex does not match it, so a later\n",
"# pd.concat(..., axis=1) would union the labels (extra all-NaN row) and pair encodings with the wrong passengers.\n",
"encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns, index=titanic.index)\n",
|
||
"\n",
|
||
"encoded_values_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Добавление признаков в исходный Dataframe"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Survived</th>\n",
|
||
" <th>Pclass</th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Sex</th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>SibSp</th>\n",
|
||
" <th>Parch</th>\n",
|
||
" <th>Ticket</th>\n",
|
||
" <th>Fare</th>\n",
|
||
" <th>Cabin</th>\n",
|
||
" <th>Embarked</th>\n",
|
||
" <th>Embarked_Q</th>\n",
|
||
" <th>Embarked_S</th>\n",
|
||
" <th>Embarked_nan</th>\n",
|
||
" <th>Sex_male</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>Braund, Mr. Owen Harris</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>A/5 21171</td>\n",
|
||
" <td>7.2500</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>PC 17599</td>\n",
|
||
" <td>71.2833</td>\n",
|
||
" <td>C85</td>\n",
|
||
" <td>C</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>Heikkinen, Miss. Laina</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>STON/O2. 3101282</td>\n",
|
||
" <td>7.9250</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>113803</td>\n",
|
||
" <td>53.1000</td>\n",
|
||
" <td>C123</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>Allen, Mr. William Henry</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>373450</td>\n",
|
||
" <td>8.0500</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>888</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Graham, Miss. Margaret Edith</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>19.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>112053</td>\n",
|
||
" <td>30.0000</td>\n",
|
||
" <td>B42</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>889</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>W./C. 6607</td>\n",
|
||
" <td>23.4500</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>890</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Behr, Mr. Karl Howell</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>111369</td>\n",
|
||
" <td>30.0000</td>\n",
|
||
" <td>C148</td>\n",
|
||
" <td>C</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>891</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>Dooley, Mr. Patrick</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>32.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>370376</td>\n",
|
||
" <td>7.7500</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Q</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>892 rows × 15 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Survived Pclass Name \\\n",
|
||
"1 0.0 3.0 Braund, Mr. Owen Harris \n",
|
||
"2 1.0 1.0 Cumings, Mrs. John Bradley (Florence Briggs Th... \n",
|
||
"3 1.0 3.0 Heikkinen, Miss. Laina \n",
|
||
"4 1.0 1.0 Futrelle, Mrs. Jacques Heath (Lily May Peel) \n",
|
||
"5 0.0 3.0 Allen, Mr. William Henry \n",
|
||
".. ... ... ... \n",
|
||
"888 1.0 1.0 Graham, Miss. Margaret Edith \n",
|
||
"889 0.0 3.0 Johnston, Miss. Catherine Helen \"Carrie\" \n",
|
||
"890 1.0 1.0 Behr, Mr. Karl Howell \n",
|
||
"891 0.0 3.0 Dooley, Mr. Patrick \n",
|
||
"0 NaN NaN NaN \n",
|
||
"\n",
|
||
" Sex Age SibSp Parch Ticket Fare Cabin Embarked \\\n",
|
||
"1 male 22.0 1.0 0.0 A/5 21171 7.2500 NaN S \n",
|
||
"2 female 38.0 1.0 0.0 PC 17599 71.2833 C85 C \n",
|
||
"3 female 26.0 0.0 0.0 STON/O2. 3101282 7.9250 NaN S \n",
|
||
"4 female 35.0 1.0 0.0 113803 53.1000 C123 S \n",
|
||
"5 male 35.0 0.0 0.0 373450 8.0500 NaN S \n",
|
||
".. ... ... ... ... ... ... ... ... \n",
|
||
"888 female 19.0 0.0 0.0 112053 30.0000 B42 S \n",
|
||
"889 female NaN 1.0 2.0 W./C. 6607 23.4500 NaN S \n",
|
||
"890 male 26.0 0.0 0.0 111369 30.0000 C148 C \n",
|
||
"891 male 32.0 0.0 0.0 370376 7.7500 NaN Q \n",
|
||
"0 NaN NaN NaN NaN NaN NaN NaN NaN \n",
|
||
"\n",
|
||
" Embarked_Q Embarked_S Embarked_nan Sex_male \n",
|
||
"1 0.0 0.0 0.0 0.0 \n",
|
||
"2 0.0 1.0 0.0 0.0 \n",
|
||
"3 0.0 1.0 0.0 0.0 \n",
|
||
"4 0.0 1.0 0.0 1.0 \n",
|
||
"5 1.0 0.0 0.0 1.0 \n",
|
||
".. ... ... ... ... \n",
|
||
"888 0.0 1.0 0.0 0.0 \n",
|
||
"889 0.0 0.0 0.0 1.0 \n",
|
||
"890 1.0 0.0 0.0 1.0 \n",
|
||
"891 NaN NaN NaN NaN \n",
|
||
"0 0.0 1.0 0.0 1.0 \n",
|
||
"\n",
|
||
"[892 rows x 15 columns]"
|
||
]
|
||
},
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Drop one-hot columns left by a previous run of this cell first, so re-running it does not duplicate them.\n",
"titanic = pd.concat([titanic.drop(columns=encoded_values_df.columns, errors=\"ignore\"), encoded_values_df], axis=1)\n",
|
||
"\n",
|
||
"titanic"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Дискретизация признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Равномерное разделение данных на 3 группы (интервалы равной ширины)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"labels = [\"young\", \"middle-aged\", \"old\"]\n",
|
||
"num_bins = 3"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(array([ 0.42 , 26.94666667, 53.47333333, 80. ]),\n",
|
||
" array([319, 523, 50]))"
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"hist1, bins1 = np.histogram(titanic[\"Age\"].fillna(titanic[\"Age\"].median()), bins=num_bins)\n",
|
||
"bins1, hist1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>Age</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>(0.42, 26.947]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>(26.947, 53.473]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>(0.42, 26.947]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>(26.947, 53.473]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>(26.947, 53.473]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>(53.473, 80.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>(0.42, 26.947]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>(26.947, 53.473]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>(0.42, 26.947]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>(0.42, 26.947]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>58.0</td>\n",
|
||
" <td>(53.473, 80.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>(0.42, 26.947]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>39.0</td>\n",
|
||
" <td>(26.947, 53.473]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>(0.42, 26.947]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>(53.473, 80.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>(0.42, 26.947]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>31.0</td>\n",
|
||
" <td>(26.947, 53.473]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Age Age\n",
|
||
"1 22.0 (0.42, 26.947]\n",
|
||
"2 38.0 (26.947, 53.473]\n",
|
||
"3 26.0 (0.42, 26.947]\n",
|
||
"4 35.0 (26.947, 53.473]\n",
|
||
"5 35.0 (26.947, 53.473]\n",
|
||
"6 NaN NaN\n",
|
||
"7 54.0 (53.473, 80.0]\n",
|
||
"8 2.0 (0.42, 26.947]\n",
|
||
"9 27.0 (26.947, 53.473]\n",
|
||
"10 14.0 (0.42, 26.947]\n",
|
||
"11 4.0 (0.42, 26.947]\n",
|
||
"12 58.0 (53.473, 80.0]\n",
|
||
"13 20.0 (0.42, 26.947]\n",
|
||
"14 39.0 (26.947, 53.473]\n",
|
||
"15 14.0 (0.42, 26.947]\n",
|
||
"16 55.0 (53.473, 80.0]\n",
|
||
"17 2.0 (0.42, 26.947]\n",
|
||
"18 NaN NaN\n",
|
||
"19 31.0 (26.947, 53.473]\n",
|
||
"20 NaN NaN"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# include_lowest=True closes the first interval on the left; otherwise the minimum age (== bins1[0]) falls outside every bin and becomes NaN.\n",
"pd.concat([titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins1), include_lowest=True)], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>Age</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>58.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>39.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>31.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Age Age\n",
|
||
"1 22.0 young\n",
|
||
"2 38.0 middle-aged\n",
|
||
"3 26.0 young\n",
|
||
"4 35.0 middle-aged\n",
|
||
"5 35.0 middle-aged\n",
|
||
"6 NaN NaN\n",
|
||
"7 54.0 old\n",
|
||
"8 2.0 young\n",
|
||
"9 27.0 middle-aged\n",
|
||
"10 14.0 young\n",
|
||
"11 4.0 young\n",
|
||
"12 58.0 old\n",
|
||
"13 20.0 young\n",
|
||
"14 39.0 middle-aged\n",
|
||
"15 14.0 young\n",
|
||
"16 55.0 old\n",
|
||
"17 2.0 young\n",
|
||
"18 NaN NaN\n",
|
||
"19 31.0 middle-aged\n",
|
||
"20 NaN NaN"
|
||
]
|
||
},
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# include_lowest=True closes the first interval on the left; otherwise the minimum age (== bins1[0]) gets no label and becomes NaN.\n",
"pd.concat([titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins1), labels=labels, include_lowest=True)], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Равномерное разделение данных на 3 группы с установкой собственных границ диапазона значений (от 0 до 100)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(array([ 0. , 33.33333333, 66.66666667, 100. ]),\n",
|
||
" array([641, 244, 7]))"
|
||
]
|
||
},
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"bins2 = np.linspace(0, 100, 4)\n",
|
||
"tmp_bins2 = np.digitize(titanic[\"Age\"].fillna(titanic[\"Age\"].median()), bins2)\n",
|
||
"# minlength keeps one count per bin even when the highest bins are empty (bincount otherwise stops at the largest index seen).\n",
"hist2 = np.bincount(tmp_bins2 - 1, minlength=len(bins2) - 1)\n",
|
||
"bins2, hist2"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>Age</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>(0.0, 33.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>(33.333, 66.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>(0.0, 33.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>(33.333, 66.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>(33.333, 66.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>(33.333, 66.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>(0.0, 33.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>(0.0, 33.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>(0.0, 33.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>(0.0, 33.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>58.0</td>\n",
|
||
" <td>(33.333, 66.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>(0.0, 33.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>39.0</td>\n",
|
||
" <td>(33.333, 66.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>(0.0, 33.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>(33.333, 66.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>(0.0, 33.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>31.0</td>\n",
|
||
" <td>(0.0, 33.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Age Age\n",
|
||
"1 22.0 (0.0, 33.333]\n",
|
||
"2 38.0 (33.333, 66.667]\n",
|
||
"3 26.0 (0.0, 33.333]\n",
|
||
"4 35.0 (33.333, 66.667]\n",
|
||
"5 35.0 (33.333, 66.667]\n",
|
||
"6 NaN NaN\n",
|
||
"7 54.0 (33.333, 66.667]\n",
|
||
"8 2.0 (0.0, 33.333]\n",
|
||
"9 27.0 (0.0, 33.333]\n",
|
||
"10 14.0 (0.0, 33.333]\n",
|
||
"11 4.0 (0.0, 33.333]\n",
|
||
"12 58.0 (33.333, 66.667]\n",
|
||
"13 20.0 (0.0, 33.333]\n",
|
||
"14 39.0 (33.333, 66.667]\n",
|
||
"15 14.0 (0.0, 33.333]\n",
|
||
"16 55.0 (33.333, 66.667]\n",
|
||
"17 2.0 (0.0, 33.333]\n",
|
||
"18 NaN NaN\n",
|
||
"19 31.0 (0.0, 33.333]\n",
|
||
"20 NaN NaN"
|
||
]
|
||
},
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Raw age next to the bins2 interval that pd.cut assigns to it (first 20 rows)\n",
|
||
"pd.concat(\n",
|
||
"    [titanic[\"Age\"], pd.cut(x=titanic[\"Age\"], bins=list(bins2))],\n",
|
||
"    axis=\"columns\",\n",
|
||
").head(n=20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>Age</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>58.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>39.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>31.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Age Age\n",
|
||
"1 22.0 young\n",
|
||
"2 38.0 middle-aged\n",
|
||
"3 26.0 young\n",
|
||
"4 35.0 middle-aged\n",
|
||
"5 35.0 middle-aged\n",
|
||
"6 NaN NaN\n",
|
||
"7 54.0 middle-aged\n",
|
||
"8 2.0 young\n",
|
||
"9 27.0 young\n",
|
||
"10 14.0 young\n",
|
||
"11 4.0 young\n",
|
||
"12 58.0 middle-aged\n",
|
||
"13 20.0 young\n",
|
||
"14 39.0 middle-aged\n",
|
||
"15 14.0 young\n",
|
||
"16 55.0 middle-aged\n",
|
||
"17 2.0 young\n",
|
||
"18 NaN NaN\n",
|
||
"19 31.0 young\n",
|
||
"20 NaN NaN"
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Bin ages on the bins2 edges and show the matching entry of `labels` instead of the interval\n",
|
||
"pd.concat(\n",
|
||
"    [titanic[\"Age\"], pd.cut(x=titanic[\"Age\"], bins=list(bins2), labels=labels)],\n",
|
||
"    axis=\"columns\",\n",
|
||
").head(n=20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Разделение данных на 3 группы с установкой собственных границ интервалов (0, 40, 60, 100)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(array([ 0, 40, 60, 100]), array([729, 137, 26]))"
|
||
]
|
||
},
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Count rows per custom age bin; missing ages are replaced by the median age first\n",
|
||
"hist3, bins3 = np.histogram(\n",
|
||
"    a=titanic[\"Age\"].fillna(value=titanic[\"Age\"].median()),\n",
|
||
"    bins=[0, 40, 60, 100],\n",
|
||
")\n",
|
||
"bins3, hist3"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>Age</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>(40.0, 60.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>58.0</td>\n",
|
||
" <td>(40.0, 60.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>39.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>(40.0, 60.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>31.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Age Age\n",
|
||
"1 22.0 (0.0, 40.0]\n",
|
||
"2 38.0 (0.0, 40.0]\n",
|
||
"3 26.0 (0.0, 40.0]\n",
|
||
"4 35.0 (0.0, 40.0]\n",
|
||
"5 35.0 (0.0, 40.0]\n",
|
||
"6 NaN NaN\n",
|
||
"7 54.0 (40.0, 60.0]\n",
|
||
"8 2.0 (0.0, 40.0]\n",
|
||
"9 27.0 (0.0, 40.0]\n",
|
||
"10 14.0 (0.0, 40.0]\n",
|
||
"11 4.0 (0.0, 40.0]\n",
|
||
"12 58.0 (40.0, 60.0]\n",
|
||
"13 20.0 (0.0, 40.0]\n",
|
||
"14 39.0 (0.0, 40.0]\n",
|
||
"15 14.0 (0.0, 40.0]\n",
|
||
"16 55.0 (40.0, 60.0]\n",
|
||
"17 2.0 (0.0, 40.0]\n",
|
||
"18 NaN NaN\n",
|
||
"19 31.0 (0.0, 40.0]\n",
|
||
"20 NaN NaN"
|
||
]
|
||
},
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Raw age next to the bins3 interval that pd.cut assigns to it (first 20 rows)\n",
|
||
"pd.concat(\n",
|
||
"    [titanic[\"Age\"], pd.cut(x=titanic[\"Age\"], bins=list(bins3))],\n",
|
||
"    axis=\"columns\",\n",
|
||
").head(n=20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>Age</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>58.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>39.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>31.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Age Age\n",
|
||
"1 22.0 young\n",
|
||
"2 38.0 young\n",
|
||
"3 26.0 young\n",
|
||
"4 35.0 young\n",
|
||
"5 35.0 young\n",
|
||
"6 NaN NaN\n",
|
||
"7 54.0 middle-aged\n",
|
||
"8 2.0 young\n",
|
||
"9 27.0 young\n",
|
||
"10 14.0 young\n",
|
||
"11 4.0 young\n",
|
||
"12 58.0 middle-aged\n",
|
||
"13 20.0 young\n",
|
||
"14 39.0 young\n",
|
||
"15 14.0 young\n",
|
||
"16 55.0 middle-aged\n",
|
||
"17 2.0 young\n",
|
||
"18 NaN NaN\n",
|
||
"19 31.0 young\n",
|
||
"20 NaN NaN"
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Bin ages on the bins3 edges and show the matching entry of `labels` instead of the interval\n",
|
||
"pd.concat(\n",
|
||
"    [titanic[\"Age\"], pd.cut(x=titanic[\"Age\"], bins=list(bins3), labels=labels)],\n",
|
||
"    axis=\"columns\",\n",
|
||
").head(n=20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Квантильное разделение данных на 3 группы"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>Age</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>58.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>39.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>31.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Age Age\n",
|
||
"1 22.0 0.0\n",
|
||
"2 38.0 2.0\n",
|
||
"3 26.0 1.0\n",
|
||
"4 35.0 2.0\n",
|
||
"5 35.0 2.0\n",
|
||
"6 NaN NaN\n",
|
||
"7 54.0 2.0\n",
|
||
"8 2.0 0.0\n",
|
||
"9 27.0 1.0\n",
|
||
"10 14.0 0.0\n",
|
||
"11 4.0 0.0\n",
|
||
"12 58.0 2.0\n",
|
||
"13 20.0 0.0\n",
|
||
"14 39.0 2.0\n",
|
||
"15 14.0 0.0\n",
|
||
"16 55.0 2.0\n",
|
||
"17 2.0 0.0\n",
|
||
"18 NaN NaN\n",
|
||
"19 31.0 1.0\n",
|
||
"20 NaN NaN"
|
||
]
|
||
},
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Quantile-based split into 3 groups; labels=False yields the integer bin index\n",
|
||
"pd.concat(\n",
|
||
"    [titanic[\"Age\"], pd.qcut(x=titanic[\"Age\"], q=3, labels=False)],\n",
|
||
"    axis=\"columns\",\n",
|
||
").head(n=20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>Age</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>58.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>39.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>31.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Age Age\n",
|
||
"1 22.0 young\n",
|
||
"2 38.0 old\n",
|
||
"3 26.0 middle-aged\n",
|
||
"4 35.0 old\n",
|
||
"5 35.0 old\n",
|
||
"6 NaN NaN\n",
|
||
"7 54.0 old\n",
|
||
"8 2.0 young\n",
|
||
"9 27.0 middle-aged\n",
|
||
"10 14.0 young\n",
|
||
"11 4.0 young\n",
|
||
"12 58.0 old\n",
|
||
"13 20.0 young\n",
|
||
"14 39.0 old\n",
|
||
"15 14.0 young\n",
|
||
"16 55.0 old\n",
|
||
"17 2.0 young\n",
|
||
"18 NaN NaN\n",
|
||
"19 31.0 middle-aged\n",
|
||
"20 NaN NaN"
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Quantile-based split into 3 groups, each shown by its name from `labels`\n",
|
||
"pd.concat(\n",
|
||
"    [titanic[\"Age\"], pd.qcut(x=titanic[\"Age\"], q=3, labels=labels)],\n",
|
||
"    axis=\"columns\",\n",
|
||
").head(n=20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Пример конструирования признаков на основе существующих\n",
|
||
"\n",
|
||
"Title - обращение к пассажиру (Mr, Mrs, Miss)\n",
|
||
"\n",
|
||
"Is_married - замужняя ли женщина\n",
|
||
"\n",
|
||
"Cabin_type - палуба (тип каюты)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Survived</th>\n",
|
||
" <th>Pclass</th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Sex</th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>SibSp</th>\n",
|
||
" <th>Parch</th>\n",
|
||
" <th>Ticket</th>\n",
|
||
" <th>Fare</th>\n",
|
||
" <th>Cabin</th>\n",
|
||
" <th>Embarked</th>\n",
|
||
" <th>Title</th>\n",
|
||
" <th>Is_married</th>\n",
|
||
" <th>Cabin_type</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>PC 17599</td>\n",
|
||
" <td>71.2833</td>\n",
|
||
" <td>C85</td>\n",
|
||
" <td>C</td>\n",
|
||
" <td>Mrs</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>C</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>113803</td>\n",
|
||
" <td>53.1000</td>\n",
|
||
" <td>C123</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>Mrs</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>C</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>McCarthy, Mr. Timothy J</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>17463</td>\n",
|
||
" <td>51.8625</td>\n",
|
||
" <td>E46</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>Mr</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>E</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>Sandstrom, Miss. Marguerite Rut</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>PP 9549</td>\n",
|
||
" <td>16.7000</td>\n",
|
||
" <td>G6</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>Miss</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>G</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Bonnell, Miss. Elizabeth</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>58.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>113783</td>\n",
|
||
" <td>26.5500</td>\n",
|
||
" <td>C103</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>Miss</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>C</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>872</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Beckwith, Mrs. Richard Leonard (Sallie Monypeny)</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>47.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>11751</td>\n",
|
||
" <td>52.5542</td>\n",
|
||
" <td>D35</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>Mrs</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>D</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>873</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Carlsson, Mr. Frans Olof</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>33.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>695</td>\n",
|
||
" <td>5.0000</td>\n",
|
||
" <td>B51 B53 B55</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>Mr</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>B</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>880</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>56.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>11767</td>\n",
|
||
" <td>83.1583</td>\n",
|
||
" <td>C50</td>\n",
|
||
" <td>C</td>\n",
|
||
" <td>Mrs</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>C</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>888</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Graham, Miss. Margaret Edith</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>19.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>112053</td>\n",
|
||
" <td>30.0000</td>\n",
|
||
" <td>B42</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>Miss</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>B</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>890</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Behr, Mr. Karl Howell</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>111369</td>\n",
|
||
" <td>30.0000</td>\n",
|
||
" <td>C148</td>\n",
|
||
" <td>C</td>\n",
|
||
" <td>Mr</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>C</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>183 rows × 14 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Survived Pclass Name \\\n",
|
||
"2 1.0 1.0 Cumings, Mrs. John Bradley (Florence Briggs Th... \n",
|
||
"4 1.0 1.0 Futrelle, Mrs. Jacques Heath (Lily May Peel) \n",
|
||
"7 0.0 1.0 McCarthy, Mr. Timothy J \n",
|
||
"11 1.0 3.0 Sandstrom, Miss. Marguerite Rut \n",
|
||
"12 1.0 1.0 Bonnell, Miss. Elizabeth \n",
|
||
".. ... ... ... \n",
|
||
"872 1.0 1.0 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) \n",
|
||
"873 0.0 1.0 Carlsson, Mr. Frans Olof \n",
|
||
"880 1.0 1.0 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) \n",
|
||
"888 1.0 1.0 Graham, Miss. Margaret Edith \n",
|
||
"890 1.0 1.0 Behr, Mr. Karl Howell \n",
|
||
"\n",
|
||
" Sex Age SibSp Parch Ticket Fare Cabin Embarked \\\n",
|
||
"2 female 38.0 1.0 0.0 PC 17599 71.2833 C85 C \n",
|
||
"4 female 35.0 1.0 0.0 113803 53.1000 C123 S \n",
|
||
"7 male 54.0 0.0 0.0 17463 51.8625 E46 S \n",
|
||
"11 female 4.0 1.0 1.0 PP 9549 16.7000 G6 S \n",
|
||
"12 female 58.0 0.0 0.0 113783 26.5500 C103 S \n",
|
||
".. ... ... ... ... ... ... ... ... \n",
|
||
"872 female 47.0 1.0 1.0 11751 52.5542 D35 S \n",
|
||
"873 male 33.0 0.0 0.0 695 5.0000 B51 B53 B55 S \n",
|
||
"880 female 56.0 0.0 1.0 11767 83.1583 C50 C \n",
|
||
"888 female 19.0 0.0 0.0 112053 30.0000 B42 S \n",
|
||
"890 male 26.0 0.0 0.0 111369 30.0000 C148 C \n",
|
||
"\n",
|
||
" Title Is_married Cabin_type \n",
|
||
"2 Mrs 1 C \n",
|
||
"4 Mrs 1 C \n",
|
||
"7 Mr 0 E \n",
|
||
"11 Miss 0 G \n",
|
||
"12 Miss 0 C \n",
|
||
".. ... ... ... \n",
|
||
"872 Mrs 1 D \n",
|
||
"873 Mr 0 B \n",
|
||
"880 Mrs 1 C \n",
|
||
"888 Miss 0 B \n",
|
||
"890 Mr 0 C \n",
|
||
"\n",
|
||
"[183 rows x 14 columns]"
|
||
]
|
||
},
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Derive Title, Is_married and Cabin_type from Name / Cabin with vectorized\n",
|
||
"# string accessors; rows with any missing value are dropped first.\n",
|
||
"titanic_cl = (\n",
|
||
"    titanic.drop(\n",
|
||
"        columns=[\"Embarked_Q\", \"Embarked_S\", \"Embarked_nan\", \"Sex_male\"],\n",
|
||
"        errors=\"ignore\",\n",
|
||
"    )\n",
|
||
"    .dropna()\n",
|
||
"    .assign(\n",
|
||
"        # \"McCarthy, Mr. Timothy J\" -> \"Mr\": text between the comma and the next dot\n",
|
||
"        Title=lambda df: (\n",
|
||
"            df[\"Name\"].str.split(\",\").str[1].str.split(\".\").str[0].str.strip()\n",
|
||
"        ),\n",
|
||
"        # 1 when the title is exactly \"Mrs\", otherwise 0\n",
|
||
"        Is_married=lambda df: df[\"Title\"].eq(\"Mrs\").astype(\"int64\"),\n",
|
||
"        # First character of the cabin code\n",
|
||
"        Cabin_type=lambda df: df[\"Cabin\"].str[0],\n",
|
||
"    )\n",
|
||
")\n",
|
||
"\n",
|
||
"titanic_cl"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Пример использования библиотеки Featuretools для автоматического конструирования (синтеза) признаков\n",
|
||
"\n",
|
||
"https://featuretools.alteryx.com/en/stable/getting_started/using_entitysets.html"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Загрузка данных\n",
|
||
"\n",
|
||
"За основу был взят набор данных \"Ecommerce Orders Data Set\" из Kaggle\n",
|
||
"\n",
|
||
"Используется только 100 первых заказов и связанные с ними объекты\n",
|
||
"\n",
|
||
"https://www.kaggle.com/datasets/sangamsharmait/ecommerce-orders-data-analysis"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import featuretools as ft\n",
|
||
"from woodwork.logical_types import Categorical, Datetime\n",
|
||
"\n",
|
||
"# Parse the two delivery timestamps while reading so that the 1900-01-01\n",
|
||
"# placeholder for missing values keeps both columns datetime64 instead of\n",
|
||
"# mixing str and Timestamp objects in one column.\n",
|
||
"delivery_date_cols = [\"order_delivered_carrier_date\", \"order_delivered_customer_date\"]\n",
|
||
"missing_date = pd.Timestamp(\"1900-01-01 00:00:00\")\n",
|
||
"\n",
|
||
"customers = pd.read_csv(\"data/orders/customers.csv\")\n",
|
||
"sellers = pd.read_csv(\"data/orders/sellers.csv\")\n",
|
||
"products = pd.read_csv(\"data/orders/products.csv\")\n",
|
||
"orders = pd.read_csv(\n",
|
||
"    \"data/orders/orders.csv\", parse_dates=delivery_date_cols\n",
|
||
").fillna({col: missing_date for col in delivery_date_cols})\n",
|
||
"order_items = pd.read_csv(\"data/orders/order_items.csv\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Создание сущностей в featuretools\n",
|
||
"\n",
|
||
"Добавление dataframe'ов с данными в EntitySet с указанием параметров: название сущности (таблицы), первичный ключ, категориальные атрибуты (в том числе даты)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Entityset: orders\n",
|
||
" DataFrames:\n",
|
||
" customers [Rows: 100, Columns: 5]\n",
|
||
" sellers [Rows: 87, Columns: 4]\n",
|
||
" products [Rows: 100, Columns: 9]\n",
|
||
" orders [Rows: 100, Columns: 8]\n",
|
||
" order_items [Rows: 115, Columns: 8]\n",
|
||
" Relationships:\n",
|
||
" No relationships"
|
||
]
|
||
},
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"es = ft.EntitySet(id=\"orders\")\n",
|
||
"\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"customers\",\n",
|
||
" dataframe=customers,\n",
|
||
" index=\"customer_id\",\n",
|
||
" logical_types={\n",
|
||
" \"customer_unique_id\": Categorical,\n",
|
||
" \"customer_zip_code_prefix\": Categorical,\n",
|
||
" \"customer_city\": Categorical,\n",
|
||
" \"customer_state\": Categorical,\n",
|
||
" },\n",
|
||
")\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"sellers\",\n",
|
||
" dataframe=sellers,\n",
|
||
" index=\"seller_id\",\n",
|
||
" logical_types={\n",
|
||
" \"seller_zip_code_prefix\": Categorical,\n",
|
||
" \"seller_city\": Categorical,\n",
|
||
" \"seller_state\": Categorical,\n",
|
||
" },\n",
|
||
")\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"products\",\n",
|
||
" dataframe=products,\n",
|
||
" index=\"product_id\",\n",
|
||
" logical_types={\n",
|
||
" \"product_category_name\": Categorical,\n",
|
||
" \"product_name_lenght\": Categorical,\n",
|
||
" \"product_description_lenght\": Categorical,\n",
|
||
" \"product_photos_qty\": Categorical,\n",
|
||
" },\n",
|
||
")\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"orders\",\n",
|
||
" dataframe=orders,\n",
|
||
" index=\"order_id\",\n",
|
||
" logical_types={\n",
|
||
" \"order_status\": Categorical,\n",
|
||
" \"order_purchase_timestamp\": Datetime,\n",
|
||
" \"order_approved_at\": Datetime,\n",
|
||
" \"order_delivered_carrier_date\": Datetime,\n",
|
||
" \"order_delivered_customer_date\": Datetime,\n",
|
||
" \"order_estimated_delivery_date\": Datetime,\n",
|
||
" },\n",
|
||
")\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"order_items\",\n",
|
||
" dataframe=order_items,\n",
|
||
" index=\"orderitem_id\",\n",
|
||
" make_index=True,\n",
|
||
" logical_types={\"shipping_limit_date\": Datetime},\n",
|
||
")\n",
|
||
"\n",
|
||
"es"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Настройка связей между сущностями featuretools\n",
|
||
"\n",
|
||
"Настройка связей между таблицами на уровне ключей\n",
|
||
"\n",
|
||
"Связь указывается от родителя к потомкам (таблица-родитель, первичный ключ, таблица-потомок, внешний ключ)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Entityset: orders\n",
|
||
" DataFrames:\n",
|
||
" customers [Rows: 100, Columns: 5]\n",
|
||
" sellers [Rows: 87, Columns: 4]\n",
|
||
" products [Rows: 100, Columns: 9]\n",
|
||
" orders [Rows: 100, Columns: 8]\n",
|
||
" order_items [Rows: 115, Columns: 8]\n",
|
||
" Relationships:\n",
|
||
" orders.customer_id -> customers.customer_id\n",
|
||
" order_items.order_id -> orders.order_id\n",
|
||
" order_items.product_id -> products.product_id\n",
|
||
" order_items.seller_id -> sellers.seller_id"
|
||
]
|
||
},
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"es = es.add_relationship(\"customers\", \"customer_id\", \"orders\", \"customer_id\")\n",
|
||
"es = es.add_relationship(\"orders\", \"order_id\", \"order_items\", \"order_id\")\n",
|
||
"es = es.add_relationship(\"products\", \"product_id\", \"order_items\", \"product_id\")\n",
|
||
"es = es.add_relationship(\"sellers\", \"seller_id\", \"order_items\", \"seller_id\")\n",
|
||
"\n",
|
||
"es"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Автоматическое конструирование признаков с помощью featuretools\n",
|
||
"\n",
|
||
"Библиотека применяет различные функции агрегации и трансформации к атрибутам таблицы order_items с учетом отношений\n",
|
||
"\n",
|
||
"Результат помещается в Dataframe feature_matrix"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\featuretools\\synthesis\\dfs.py:321: UnusedPrimitiveWarning: Some specified primitives were not used during DFS:\n",
|
||
" agg_primitives: ['any', 'mode']\n",
|
||
"This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.\n",
|
||
" warnings.warn(warning_msg, UnusedPrimitiveWarning)\n",
|
||
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function mean at 0x00000245E1C73EC0> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
|
||
" ).agg(to_agg)\n",
|
||
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function mean at 0x00000245E1C73EC0> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
|
||
" ).agg(to_agg)\n",
|
||
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function mean at 0x00000245E1C73EC0> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
|
||
" ).agg(to_agg)\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>order_item_id</th>\n",
|
||
" <th>price</th>\n",
|
||
" <th>freight_value</th>\n",
|
||
" <th>HOUR(shipping_limit_date)</th>\n",
|
||
" <th>WEEKDAY(shipping_limit_date)</th>\n",
|
||
" <th>orders.order_status</th>\n",
|
||
" <th>products.product_category_name</th>\n",
|
||
" <th>products.product_name_lenght</th>\n",
|
||
" <th>products.product_description_lenght</th>\n",
|
||
" <th>products.product_photos_qty</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>orders.customers.customer_city</th>\n",
|
||
" <th>orders.customers.customer_state</th>\n",
|
||
" <th>products.COUNT(order_items)</th>\n",
|
||
" <th>products.MEAN(order_items.freight_value)</th>\n",
|
||
" <th>products.MEAN(order_items.order_item_id)</th>\n",
|
||
" <th>products.MEAN(order_items.price)</th>\n",
|
||
" <th>sellers.COUNT(order_items)</th>\n",
|
||
" <th>sellers.MEAN(order_items.freight_value)</th>\n",
|
||
" <th>sellers.MEAN(order_items.order_item_id)</th>\n",
|
||
" <th>sellers.MEAN(order_items.price)</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>orderitem_id</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>38.50</td>\n",
|
||
" <td>24.84</td>\n",
|
||
" <td>20</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>delivered</td>\n",
|
||
" <td>cama_mesa_banho</td>\n",
|
||
" <td>53.0</td>\n",
|
||
" <td>223.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>santa luzia</td>\n",
|
||
" <td>PB</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>24.84</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>38.50</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>21.340</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>61.200000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>29.99</td>\n",
|
||
" <td>7.39</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>delivered</td>\n",
|
||
" <td>telefonia</td>\n",
|
||
" <td>59.0</td>\n",
|
||
" <td>675.0</td>\n",
|
||
" <td>5.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>sao paulo</td>\n",
|
||
" <td>SP</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>7.39</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>29.99</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>7.390</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>29.990000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>110.99</td>\n",
|
||
" <td>21.27</td>\n",
|
||
" <td>21</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>delivered</td>\n",
|
||
" <td>cama_mesa_banho</td>\n",
|
||
" <td>52.0</td>\n",
|
||
" <td>413.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>gravatai</td>\n",
|
||
" <td>RS</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>21.27</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>110.99</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>21.270</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>110.990000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>27.99</td>\n",
|
||
" <td>15.10</td>\n",
|
||
" <td>23</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>delivered</td>\n",
|
||
" <td>telefonia</td>\n",
|
||
" <td>60.0</td>\n",
|
||
" <td>818.0</td>\n",
|
||
" <td>6.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>imbituba</td>\n",
|
||
" <td>SC</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>15.10</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>27.99</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>13.970</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>26.490000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>49.90</td>\n",
|
||
" <td>16.05</td>\n",
|
||
" <td>13</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>invoiced</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>santa rosa</td>\n",
|
||
" <td>RS</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>16.05</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>49.90</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>16.050</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>49.900000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>110</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>17.90</td>\n",
|
||
" <td>10.96</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>delivered</td>\n",
|
||
" <td>cama_mesa_banho</td>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>122.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>jundiai</td>\n",
|
||
" <td>SP</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>10.96</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>17.90</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>10.960</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>17.900000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>111</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>79.99</td>\n",
|
||
" <td>8.91</td>\n",
|
||
" <td>9</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>delivered</td>\n",
|
||
" <td>beleza_saude</td>\n",
|
||
" <td>59.0</td>\n",
|
||
" <td>492.0</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>sao paulo</td>\n",
|
||
" <td>SP</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>8.91</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>79.99</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>13.206</td>\n",
|
||
" <td>1.2</td>\n",
|
||
" <td>54.590000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>112</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>190.00</td>\n",
|
||
" <td>19.41</td>\n",
|
||
" <td>13</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>delivered</td>\n",
|
||
" <td>climatizacao</td>\n",
|
||
" <td>60.0</td>\n",
|
||
" <td>3270.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>paulinia</td>\n",
|
||
" <td>SP</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>19.41</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>190.00</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>19.410</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>190.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>113</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>109.90</td>\n",
|
||
" <td>15.53</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>delivered</td>\n",
|
||
" <td>cool_stuff</td>\n",
|
||
" <td>46.0</td>\n",
|
||
" <td>595.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>rio de janeiro</td>\n",
|
||
" <td>RJ</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>15.53</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>109.90</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>15.530</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>109.900000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>114</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>27.90</td>\n",
|
||
" <td>18.30</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>delivered</td>\n",
|
||
" <td>alimentos</td>\n",
|
||
" <td>59.0</td>\n",
|
||
" <td>982.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>joinville</td>\n",
|
||
" <td>SC</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>16.70</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>27.90</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>16.190</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>38.596667</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>115 rows × 43 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" order_item_id price freight_value HOUR(shipping_limit_date) \\\n",
|
||
"orderitem_id \n",
|
||
"0 1 38.50 24.84 20 \n",
|
||
"1 1 29.99 7.39 8 \n",
|
||
"2 1 110.99 21.27 21 \n",
|
||
"3 1 27.99 15.10 23 \n",
|
||
"4 1 49.90 16.05 13 \n",
|
||
"... ... ... ... ... \n",
|
||
"110 1 17.90 10.96 8 \n",
|
||
"111 1 79.99 8.91 9 \n",
|
||
"112 1 190.00 19.41 13 \n",
|
||
"113 1 109.90 15.53 2 \n",
|
||
"114 1 27.90 18.30 14 \n",
|
||
"\n",
|
||
" WEEKDAY(shipping_limit_date) orders.order_status \\\n",
|
||
"orderitem_id \n",
|
||
"0 4 delivered \n",
|
||
"1 0 delivered \n",
|
||
"2 1 delivered \n",
|
||
"3 1 delivered \n",
|
||
"4 2 invoiced \n",
|
||
"... ... ... \n",
|
||
"110 1 delivered \n",
|
||
"111 4 delivered \n",
|
||
"112 3 delivered \n",
|
||
"113 2 delivered \n",
|
||
"114 2 delivered \n",
|
||
"\n",
|
||
" products.product_category_name products.product_name_lenght \\\n",
|
||
"orderitem_id \n",
|
||
"0 cama_mesa_banho 53.0 \n",
|
||
"1 telefonia 59.0 \n",
|
||
"2 cama_mesa_banho 52.0 \n",
|
||
"3 telefonia 60.0 \n",
|
||
"4 NaN NaN \n",
|
||
"... ... ... \n",
|
||
"110 cama_mesa_banho 55.0 \n",
|
||
"111 beleza_saude 59.0 \n",
|
||
"112 climatizacao 60.0 \n",
|
||
"113 cool_stuff 46.0 \n",
|
||
"114 alimentos 59.0 \n",
|
||
"\n",
|
||
" products.product_description_lenght products.product_photos_qty \\\n",
|
||
"orderitem_id \n",
|
||
"0 223.0 1.0 \n",
|
||
"1 675.0 5.0 \n",
|
||
"2 413.0 1.0 \n",
|
||
"3 818.0 6.0 \n",
|
||
"4 NaN NaN \n",
|
||
"... ... ... \n",
|
||
"110 122.0 1.0 \n",
|
||
"111 492.0 3.0 \n",
|
||
"112 3270.0 4.0 \n",
|
||
"113 595.0 2.0 \n",
|
||
"114 982.0 1.0 \n",
|
||
"\n",
|
||
" ... orders.customers.customer_city \\\n",
|
||
"orderitem_id ... \n",
|
||
"0 ... santa luzia \n",
|
||
"1 ... sao paulo \n",
|
||
"2 ... gravatai \n",
|
||
"3 ... imbituba \n",
|
||
"4 ... santa rosa \n",
|
||
"... ... ... \n",
|
||
"110 ... jundiai \n",
|
||
"111 ... sao paulo \n",
|
||
"112 ... paulinia \n",
|
||
"113 ... rio de janeiro \n",
|
||
"114 ... joinville \n",
|
||
"\n",
|
||
" orders.customers.customer_state products.COUNT(order_items) \\\n",
|
||
"orderitem_id \n",
|
||
"0 PB 1 \n",
|
||
"1 SP 1 \n",
|
||
"2 RS 1 \n",
|
||
"3 SC 1 \n",
|
||
"4 RS 1 \n",
|
||
"... ... ... \n",
|
||
"110 SP 1 \n",
|
||
"111 SP 1 \n",
|
||
"112 SP 1 \n",
|
||
"113 RJ 1 \n",
|
||
"114 SC 2 \n",
|
||
"\n",
|
||
" products.MEAN(order_items.freight_value) \\\n",
|
||
"orderitem_id \n",
|
||
"0 24.84 \n",
|
||
"1 7.39 \n",
|
||
"2 21.27 \n",
|
||
"3 15.10 \n",
|
||
"4 16.05 \n",
|
||
"... ... \n",
|
||
"110 10.96 \n",
|
||
"111 8.91 \n",
|
||
"112 19.41 \n",
|
||
"113 15.53 \n",
|
||
"114 16.70 \n",
|
||
"\n",
|
||
" products.MEAN(order_items.order_item_id) \\\n",
|
||
"orderitem_id \n",
|
||
"0 1.0 \n",
|
||
"1 1.0 \n",
|
||
"2 1.0 \n",
|
||
"3 1.0 \n",
|
||
"4 1.0 \n",
|
||
"... ... \n",
|
||
"110 1.0 \n",
|
||
"111 1.0 \n",
|
||
"112 1.0 \n",
|
||
"113 1.0 \n",
|
||
"114 1.0 \n",
|
||
"\n",
|
||
" products.MEAN(order_items.price) sellers.COUNT(order_items) \\\n",
|
||
"orderitem_id \n",
|
||
"0 38.50 2 \n",
|
||
"1 29.99 1 \n",
|
||
"2 110.99 1 \n",
|
||
"3 27.99 2 \n",
|
||
"4 49.90 1 \n",
|
||
"... ... ... \n",
|
||
"110 17.90 1 \n",
|
||
"111 79.99 5 \n",
|
||
"112 190.00 1 \n",
|
||
"113 109.90 1 \n",
|
||
"114 27.90 3 \n",
|
||
"\n",
|
||
" sellers.MEAN(order_items.freight_value) \\\n",
|
||
"orderitem_id \n",
|
||
"0 21.340 \n",
|
||
"1 7.390 \n",
|
||
"2 21.270 \n",
|
||
"3 13.970 \n",
|
||
"4 16.050 \n",
|
||
"... ... \n",
|
||
"110 10.960 \n",
|
||
"111 13.206 \n",
|
||
"112 19.410 \n",
|
||
"113 15.530 \n",
|
||
"114 16.190 \n",
|
||
"\n",
|
||
" sellers.MEAN(order_items.order_item_id) \\\n",
|
||
"orderitem_id \n",
|
||
"0 1.0 \n",
|
||
"1 1.0 \n",
|
||
"2 1.0 \n",
|
||
"3 1.0 \n",
|
||
"4 1.0 \n",
|
||
"... ... \n",
|
||
"110 1.0 \n",
|
||
"111 1.2 \n",
|
||
"112 1.0 \n",
|
||
"113 1.0 \n",
|
||
"114 1.0 \n",
|
||
"\n",
|
||
" sellers.MEAN(order_items.price) \n",
|
||
"orderitem_id \n",
|
||
"0 61.200000 \n",
|
||
"1 29.990000 \n",
|
||
"2 110.990000 \n",
|
||
"3 26.490000 \n",
|
||
"4 49.900000 \n",
|
||
"... ... \n",
|
||
"110 17.900000 \n",
|
||
"111 54.590000 \n",
|
||
"112 190.000000 \n",
|
||
"113 109.900000 \n",
|
||
"114 38.596667 \n",
|
||
"\n",
|
||
"[115 rows x 43 columns]"
|
||
]
|
||
},
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"feature_matrix, feature_defs = ft.dfs(\n",
|
||
" entityset=es,\n",
|
||
" target_dataframe_name=\"order_items\",\n",
|
||
" agg_primitives=[\"mean\", \"count\", \"mode\", \"any\"],\n",
|
||
" trans_primitives=[\"hour\", \"weekday\"],\n",
|
||
" max_depth=2,\n",
|
||
")\n",
|
||
"\n",
|
||
"feature_matrix"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Полученные признаки\n",
|
||
"\n",
|
||
"Список определений признаков (feature_defs), соответствующих колонкам полученного dataframe'а"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"[<Feature: order_item_id>,\n",
|
||
" <Feature: price>,\n",
|
||
" <Feature: freight_value>,\n",
|
||
" <Feature: HOUR(shipping_limit_date)>,\n",
|
||
" <Feature: WEEKDAY(shipping_limit_date)>,\n",
|
||
" <Feature: orders.order_status>,\n",
|
||
" <Feature: products.product_category_name>,\n",
|
||
" <Feature: products.product_name_lenght>,\n",
|
||
" <Feature: products.product_description_lenght>,\n",
|
||
" <Feature: products.product_photos_qty>,\n",
|
||
" <Feature: products.product_weight_g>,\n",
|
||
" <Feature: products.product_length_cm>,\n",
|
||
" <Feature: products.product_height_cm>,\n",
|
||
" <Feature: products.product_width_cm>,\n",
|
||
" <Feature: sellers.seller_zip_code_prefix>,\n",
|
||
" <Feature: sellers.seller_city>,\n",
|
||
" <Feature: sellers.seller_state>,\n",
|
||
" <Feature: orders.COUNT(order_items)>,\n",
|
||
" <Feature: orders.MEAN(order_items.freight_value)>,\n",
|
||
" <Feature: orders.MEAN(order_items.order_item_id)>,\n",
|
||
" <Feature: orders.MEAN(order_items.price)>,\n",
|
||
" <Feature: orders.HOUR(order_approved_at)>,\n",
|
||
" <Feature: orders.HOUR(order_delivered_carrier_date)>,\n",
|
||
" <Feature: orders.HOUR(order_delivered_customer_date)>,\n",
|
||
" <Feature: orders.HOUR(order_estimated_delivery_date)>,\n",
|
||
" <Feature: orders.HOUR(order_purchase_timestamp)>,\n",
|
||
" <Feature: orders.WEEKDAY(order_approved_at)>,\n",
|
||
" <Feature: orders.WEEKDAY(order_delivered_carrier_date)>,\n",
|
||
" <Feature: orders.WEEKDAY(order_delivered_customer_date)>,\n",
|
||
" <Feature: orders.WEEKDAY(order_estimated_delivery_date)>,\n",
|
||
" <Feature: orders.WEEKDAY(order_purchase_timestamp)>,\n",
|
||
" <Feature: orders.customers.customer_unique_id>,\n",
|
||
" <Feature: orders.customers.customer_zip_code_prefix>,\n",
|
||
" <Feature: orders.customers.customer_city>,\n",
|
||
" <Feature: orders.customers.customer_state>,\n",
|
||
" <Feature: products.COUNT(order_items)>,\n",
|
||
" <Feature: products.MEAN(order_items.freight_value)>,\n",
|
||
" <Feature: products.MEAN(order_items.order_item_id)>,\n",
|
||
" <Feature: products.MEAN(order_items.price)>,\n",
|
||
" <Feature: sellers.COUNT(order_items)>,\n",
|
||
" <Feature: sellers.MEAN(order_items.freight_value)>,\n",
|
||
" <Feature: sellers.MEAN(order_items.order_item_id)>,\n",
|
||
" <Feature: sellers.MEAN(order_items.price)>]"
|
||
]
|
||
},
|
||
"execution_count": 21,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"feature_defs"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Отсечение значений признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Определение выбросов с помощью boxplot"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<Axes: >"
|
||
]
|
||
},
|
||
"execution_count": 22,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAh8AAAGdCAYAAACyzRGfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/GU6VOAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAqrElEQVR4nO3dfXRU9YH/8U9IJpMnEkqUSVISyI8ooRqkIiWpAkLzsEg5ZglF+dnf0hbXnpaHhqCUuIrGBaNYHo7IQ92ThdUWVNjIFuQhWdQQaoIQC4VqI9gIKCQ+JgMJmUwm8/vDZbYRVCaZfCcP79c5OcP93u/c+cA5N/Ph3jtzA9xut1sAAACG9PN3AAAA0LdQPgAAgFGUDwAAYBTlAwAAGEX5AAAARlE+AACAUZQPAABgFOUDAAAYFeTvAF/W1tams2fPqn///goICPB3HAAAcBXcbrfOnz+vuLg49ev39cc2ul35OHv2rOLj4/0dAwAAdMCZM2c0ePDgr53T7cpH//79JX0RPjIy0s9pAPiS0+lUSUmJMjMzZbFY/B0HgA/Z7XbFx8d73se/TrcrH5dOtURGRlI+gF7G6XQqLCxMkZGRlA+gl7qaSya44BQAABhF+QAAAEZRPgAAgFGUDwAAYBTlAwAAGEX5AAAARlE+AACAUZQPAABgFOUDgBEul0tlZWXav3+/ysrK5HK5/B0JgJ94VT5cLpcefvhhJSYmKjQ0VMOGDdO//uu/yu12e+a43W4tWbJEsbGxCg0NVXp6uk6cOOHz4AB6juLiYiUlJSkjI0MrV65URkaGkpKSVFxc7O9oAPzAq/Lx5JNPav369XrmmWf0zjvv6Mknn9Ty5cu1Zs0az5zly5fr6aef1oYNG3Tw4EGFh4crKytLzc3NPg8PoPsrLi7W9OnTlZKSovLycm3ZskXl5eVKSUnR9OnTKSBAHxTg/vvDFt/ghz/8oWw2m4qKijxjOTk5Cg0N1e9+9zu53W7FxcVp4cKFuv/++yVJDQ0Nstls2rRpk+6+++5vfA273a6oqCg1NDRwbxegh3O5XEpKSlJKSoq2b98ul8ulXbt26Y477lBgYKCys7N1/PhxnThxQoGBgf6OC6ATvHn/9urGct///vf17LPP6t1339X111+vo0eP6sCBA1q5cqUkqaamRrW1tUpPT/c8JyoqSmPHjlVFRcUVy4fD4ZDD4WgXXvriBlROp9ObeAC6mbKyMr3//vt6/vnn5XK5PPv0pccHHnhA48eP12uvvaYJEyb4MyqATvLmPdur8rF48WLZ7XYlJycrMDBQLpdLy5Yt0z333CNJqq2tlSTZbLZ2z7PZbJ51X1ZYWKiCgoLLxktKShQWFuZNPADdzP79+yVJH3zwgT799FPPeGlpqSTp4sWLkqTdu3ersbHRfEAAPtPU1HTVc70qHy+99JJ+//vfa/Pmzbrhhht05MgR5ebmKi4uTrNmzfI6qCTl5+crLy/Ps2y32xUfH6/MzExOuwA9XHh4uFauXKnBgwdr7NixcjqdKi0tVUZGhiwWiyorKyVJkydP5sgH0MNdOnNxNbwqHw888IAWL17sOX2SkpKiU6dOqbCwULNmzVJMTIwkqa6uTrGxsZ7n1dXVadSoUVfcptVqldVqvWzcYrHIYrF4Ew9ANzNx4kQNHTpUy5cv1/bt2z3jFotFgYGBeuqpp5SYmKiJEydyzQfQw3nznu3Vp12amprUr1/7pwQGBqqtrU2SlJiYqJiYGO3bt8+z3m636+DBg0pLS/PmpQD0AoGBgVqxYoV27typ7OxsVVZW6uLFi6qsrFR2drZ27typ3/zmNxQPoI/x6sjH1KlTtWzZMiUkJOiGG27Qn/70J61cuVI/+9nPJEkBAQHKzc3V0qVLdd111ykxMVEPP/yw4uLilJ2d3RX5AXRz06ZN07Zt27Rw4UKNHz/eM56YmKht27Zp2rRpfkwHwB+8+qjt+fPn9fDDD+vll1/WRx99pL
i4OM2cOVNLlixRcHCwpC++ZOyRRx7Rs88+q/r6et12221at26drr/++qt6DT5qC/ROLpdLr732mnbv3q3JkydzqgXoZbx5//aqfJhA+QB6L6fT6fmeD67pAnoXb96/ubcLAAAwivIBAACMonwAAACjKB8AAMAoygcAADCK8gEAAIyifAAAAKMoHwAAwCjKBwAAMIryAQAAjKJ8AAAAoygfAADAKMoHAAAwivIBAACMonwAMMLlcqmsrEz79+9XWVmZXC6XvyMB8BPKB4AuV1xcrKSkJGVkZGjlypXKyMhQUlKSiouL/R0NgB9QPgB0qeLiYk2fPl0pKSkqLy/Xli1bVF5erpSUFE2fPp0CAvRBAW632+3vEH/PbrcrKipKDQ0NioyM9HccAJ3gcrmUlJSklJQUbd++XS6XS7t27dIdd9yhwMBAZWdn6/jx4zpx4oQCAwP9HRdAJ3jz/s2RDwBdpry8XO+//74efPBB9evX/tdNv379lJ+fr5qaGpWXl/spIQB/oHwA6DLnzp2TJN14441XXH9p/NI8AH0D5QNAl4mNjZUkHT9+/IrrL41fmgegb6B8AOgy48aN09ChQ/X444+rra2t3bq2tjYVFhYqMTFR48aN81NCAP5A+QDQZQIDA7VixQrt3LlT2dnZqqys1MWLF1VZWans7Gzt3LlTv/nNb7jYFOhjgvwdAEDvNm3aNG3btk0LFy7U+PHjPeOJiYnatm2bpk2b5sd0APyBj9oCMMLlcum1117T7t27NXnyZE2cOJEjHkAv4s37N0c+ABgRGBioCRMmqLGxURMmTKB4AH0Y5QOAES0tLVqzZo1effVVnTx5UvPmzVNwcLC/YwHwAy44BdDlFi1apPDwcN1///3atWuX7r//foWHh2vRokX+jgbADzjyAaBLLVq0SE899ZRsNpsKCgpktVrlcDj0yCOP6KmnnpIkLV++3M8pAZjEkQ8AXaalpUWrVq2SzWbTqVOnNGzYMB07dkzDhg3TqVOnZLPZtGrVKrW0tPg7KgCDKB8Ausy6devU2tqqadOmKTk5WRkZGVq5cqUyMjKUnJysf/zHf1Rra6vWrVvn76gADOK0C4Au895770mS1q9frylTpmjq1Kmqrq7W8OHD9be//U0bNmxoNw9A3+DVkY+hQ4cqICDgsp85c+ZIkpqbmzVnzhxFR0crIiJCOTk5qqur65LgALq/oUOHSpKuvfZa7d27V2vWrFFJSYnWrFmjvXv36tprr203D0Df4FX5OHTokM6dO+f5KS0tlST96Ec/kiQtWLBAO3bs0NatW1VWVqazZ8/y7YVAH5aSkiJJ+vjjjxUdHa0NGzZo48aN2rBhg6Kjo/Xxxx+3mwegb/DqtMul/6Vc8sQTT2jYsGGaMGGCGhoaVFRUpM2bN2vSpEmSpI0bN2rEiBGqrKxUamqq71ID6BH+/shnW1ub2tra5Ha7PX++0jwAvV+Hr/loaWnR7373O+Xl5SkgIEBVVVVyOp1KT0/3zElOTlZCQoIqKiq+snw4HA45HA7Pst1ulyQ5nU45nc6OxgPQDbzxxhuSpDFjxuhPf/qTfvnLX3rWBQUF6ZZbbtHhw4f1xhtv6O677/ZXTAA+4M17dofLx/bt21VfX6+f/OQnkqTa2loFBwdrwIAB7ebZbDbV1tZ+5XYKCwtVUFBw2XhJSYnCwsI6Gg9AN/D+++9L+t//rJSUlKi2tlYxMTHKzMxUYWGhZ96uXbv8mBRAZzU1NV313A6Xj6KiIk2ePFlxcXEd3YQkKT8/X3l5eZ5lu92u+Ph4ZWZmcmM5oIc7efKk9uzZo6NHj+r555/XwoULVVdXJ5vNphUrVujo0aOSpPT0dN1xxx1+TgugMy6dubgaHSofp06d0n//93+ruLjYMxYTE6OWlhbV19e3O/pRV1enmJiYr9yW1WqV1Wq9bNxischisXQkHoBuYt68eVq8eLHCw8N17Ngxz/VgkjRkyBBFRUWpsbFR8+bNY38Hej
hv9uEOfcnYxo0bNWjQIE2ZMsUzNnr0aFksFu3bt88zVl1drdOnTystLa0jLwOghwsODtaCBQvU0NAgh8Oh3Nxc3XfffcrNzVVzc7MaGhq0YMECbjAH9DFeH/loa2vTxo0bNWvWLAUF/e/To6KiNHv2bOXl5WngwIGKjIzUvHnzlJaWxiddgD7s0n1bVq1apdWrV3vGg4KC9MADD3BfF6APCnC73W5vnlBSUqKsrCxVV1fr+uuvb7euublZCxcu1JYtW+RwOJSVlaV169Z97WmXL7Pb7YqKilJDQwPXfAC9SEtLi9asWaNXX31VkyZN0rx58zjiAfQi3rx/e10+uhrlA+i9nE6ndu3apTvuuINrPIBexpv3b24sBwAAjKJ8AAAAoygfAADAKMoHAAAwivIBAACMonwAMMLlcqmsrEz79+9XWVmZXC6XvyMB8BPKB4AuV1xcrKSkJGVkZGjlypXKyMhQUlJSu1s0AOg7KB8AulRxcbGmT5+ulJQUlZeXa8uWLSovL1dKSoqmT59OAQH6IL5kDECXcblcSkpKUkpKirZv3y6Xy+X5krHAwEBlZ2fr+PHjOnHihAIDA/0dF0An8CVjALqF8vJyvf/++3rwwQflcDg0f/58Pfroo5o/f74cDofy8/NVU1Oj8vJyf0cFYJDXN5YDgKt17tw5SdLSpUv1yiuveMaPHDmiDRs2eO6MfWkegL6B8gGgy8TGxkqSXnnlFQUHBys3N1eJiYmqqanR6tWrPYXk0jwAfQPXfADoMhcuXFD//v0VEBCgpqYmBQYGeq75cLlcCgsLk9vt1vnz5xUREeHvuAA6gWs+AHQLixcvliS53W7NmDFDlZWVunjxoiorKzVjxgxd+r/PpXkA+gbKB4Auc+LECUnSM888o2PHjmn8+PGaOXOmxo8fr+PHj2vNmjXt5gHoGygfALrMddddJ0n64IMPdPLkSZWWliovL0+lpaU6ceKEzpw5024egL6Baz4AdJmLFy8qLCxMwcHBOn/+vAICAjzXfLjdbvXv318tLS1qampSaGiov+MC6ARv3r/5tAuAb9TU1KS//vWvHXruhAkTVFZWpoiICE2fMUNh1wzWi1u3attLL8npdGrChAl65513OpwtOTlZYWFhHX4+APM48gHgG7311lsaPXq0v2NcUVVVlW6++WZ/xwD6PI58APCp5ORkVVVVdWobFy9e1KOPL1fZn6o14bvD9eiDi3xyqiU5ObnT2wBgFkc+ABhz5NSnyl5fqe2/SNWoIdH+jgPAh/ieDwAA0G1RPgAAgFGUDwAAYBTlAwAAGEX5AAAARlE+AACAUZQPAABgFOUDAAAYRfkAAABGUT4AAIBRXpePDz/8UD/+8Y8VHR2t0NBQpaSk6PDhw571brdbS5YsUWxsrEJDQ5Wenq4TJ074NDQAAOi5vCofn3/+uW699VZZLBbt3r1bb7/9tlasWKFvfetbnjnLly/X008/rQ0bNujgwYMKDw9XVlaWmpubfR4eAAD0PF7d1fbJJ59UfHy8Nm7c6BlLTEz0/Nntdmv16tV66KGHdOedd0qSnnvuOdlsNm3fvl133323j2IDAICeyqvy8Yc//EFZWVn60Y9+pLKyMn3729/WL3/5S/3zP/+zJKmmpka1tbVKT0/3PCcqKkpjx45VRUXFFcuHw+GQw+HwLNvtdkmS0+mU0+ns0F8KQPfU2trqeWT/BnoXb/Zpr8rH3/72N61fv155eXl68MEHdejQIc2fP1/BwcGaNWuWamtrJUk2m63d82w2m2fdlxUWFqqgoOCy8ZKSEoWFhXkTD0A3d+aCJAWpsrJSHx73dxoAvtTU1HTVc70qH21tbbrlllv0+OOPS5K++93v6vjx49qwYYNmzZrlXcr/kZ+fr7y8PM+y3W5XfHy8MjMzFRkZ2aFtAuiejp7+TDp2WKmpqbopYaC/4wDwoUtnLq6GV+UjNjZW3/nOd9qNjRgxQv/5n/8pSYqJiZEk1dXVKTY21j
Onrq5Oo0aNuuI2rVarrFbrZeMWi0UWi8WbeAC6uaCgIM8j+zfQu3izT3v1aZdbb71V1dXV7cbeffddDRkyRNIXF5/GxMRo3759nvV2u10HDx5UWlqaNy8FAAB6Ka+OfCxYsEDf//739fjjj2vGjBl688039eyzz+rZZ5+VJAUEBCg3N1dLly7Vddddp8TERD388MOKi4tTdnZ2V+QHAAA9jFflY8yYMXr55ZeVn5+vxx57TImJiVq9erXuuecez5xFixapsbFR9913n+rr63Xbbbdpz549CgkJ8Xl4AADQ8wS43W63v0P8PbvdrqioKDU0NHDBKdDLHDn1qbLXV2r7L1I1aki0v+MA8CFv3r+5twsAADCK8gEAAIyifAAAAKMoHwAAwCjKBwAAMIryAQAAjKJ8AAAAoygfAADAKMoHAAAwivIBAACMonwAAACjKB8AAMAoygcAADCK8gEAAIyifAAAAKMoHwAAwCjKBwAAMIryAQAAjKJ8AAAAoygfAADAKMoHAAAwivIBAACMonwAAACjKB8AAMAoygcAADCK8gEAAIyifAAAAKMoHwAAwCjKBwAAMIryAQAAjKJ8AAAAo7wqH48++qgCAgLa/SQnJ3vWNzc3a86cOYqOjlZERIRycnJUV1fn89AAAKDn8vrIxw033KBz5855fg4cOOBZt2DBAu3YsUNbt25VWVmZzp49q2nTpvk0MAAA6NmCvH5CUJBiYmIuG29oaFBRUZE2b96sSZMmSZI2btyoESNGqLKyUqmpqZ1PCwAAejyvy8eJEycUFxenkJAQpaWlqbCwUAkJCaqqqpLT6VR6erpnbnJyshISElRRUfGV5cPhcMjhcHiW7Xa7JMnpdMrpdHobD0A31tra6nlk/wZ6F2/2aa/Kx9ixY7Vp0yYNHz5c586dU0FBgcaNG6fjx4+rtrZWwcHBGjBgQLvn2Gw21dbWfuU2CwsLVVBQcNl4SUmJwsLCvIkHoJs7c0GSglRZWakPj/s7DQBfampquuq5XpWPyZMne/48cuRIjR07VkOGDNFLL72k0NBQbzblkZ+fr7y8PM+y3W5XfHy8MjMzFRkZ2aFtAuiejp7+TDp2WKmpqbopYaC/4wDwoUtnLq6G16dd/t6AAQN0/fXX6+TJk8rIyFBLS4vq6+vbHf2oq6u74jUil1itVlmt1svGLRaLLBZLZ+IB6GaCgoI8j+zfQO/izT7dqe/5uHDhgt577z3FxsZq9OjRslgs2rdvn2d9dXW1Tp8+rbS0tM68DAAA6EW8OvJx//33a+rUqRoyZIjOnj2rRx55RIGBgZo5c6aioqI0e/Zs5eXlaeDAgYqMjNS8efOUlpbGJ10AAICHV+Xjgw8+0MyZM/Xpp5/q2muv1W233abKykpde+21kqRVq1apX79+ysnJkcPhUFZWltatW9clwQEAQM/kVfl44YUXvnZ9SEiI1q5dq7Vr13YqFAAA6L24twsAADCK8gEAAIyifAAAAKMoHwAAwCjKBwAAMIryAQAAjKJ8AAAAoygfAADAKMoHAAAwivIBAACMonwAAACjKB8AAMAoygcAADCK8gEAAIyifAAAAKMoHwAAwCjKBwAAMIryAQAAjKJ8AAAAoygfAADAKMoHAAAwivIBAACMonwAAACjKB8AAMAoygcAADCK8gEAAIyifAAAAKMoHwAAwCjKBwAAMIryAQAAjKJ8AAAAozpVPp544gkFBAQoNzfXM9bc3Kw5c+YoOjpaERERysnJUV1dXWdzAgCAXqLD5ePQoUP67W9/q5EjR7YbX7BggXbs2KGtW7eqrKxMZ8+e1bRp0zodFAAA9A4dKh8XLlzQPffco3/7t3/Tt771Lc94Q0ODioqKtHLlSk2aNEmjR4/Wxo0b9cYbb6iystJnoQEAQM8V1JEnzZkzR1OmTFF6erqWLl3qGa+qqpLT6VR6erpnLDk5WQkJCaqoqFBqaupl23I4HHI4HJ5lu90uSXI6nXI6nR2JB6Cbam
1t9TyyfwO9izf7tNfl44UXXtBbb72lQ4cOXbautrZWwcHBGjBgQLtxm82m2traK26vsLBQBQUFl42XlJQoLCzM23gAurEzFyQpSJWVlfrwuL/TAPClpqamq57rVfk4c+aMfvWrX6m0tFQhISFeB7uS/Px85eXleZbtdrvi4+OVmZmpyMhIn7wGgO7h6OnPpGOHlZqaqpsSBvo7DgAfunTm4mp4VT6qqqr00Ucf6eabb/aMuVwu7d+/X88884z27t2rlpYW1dfXtzv6UVdXp5iYmCtu02q1ymq1XjZusVhksVi8iQegmwsKCvI8sn8DvYs3+7RX5eMHP/iBjh071m7spz/9qZKTk/XrX/9a8fHxslgs2rdvn3JyciRJ1dXVOn36tNLS0rx5KQAA0Et5VT769++vG2+8sd1YeHi4oqOjPeOzZ89WXl6eBg4cqMjISM2bN09paWlXvNgUAAD0PR36tMvXWbVqlfr166ecnBw5HA5lZWVp3bp1vn4ZAADQQ3W6fLz++uvtlkNCQrR27VqtXbu2s5sGAAC9EPd2AQAARvn8tAuA7qPmk0Y1Olr9HcPjvY8bPY+XPvnSXYRbg5R4Tbi/YwB9Qvfa+wH4TM0njZr4m9f9HeOKFm479s2T/OC1+2+ngAAGUD6AXurSEY/Vd41S0qAIP6f5QuNFh3a+XqEf3p6m8NDLv9/HX05+dEG5Lx7pVkeJgN6M8gH0ckmDInTjt6P8HUPSF/d+qL1WunnIt/iSMaAP44JTAABgFOUDAAAYRfkAAABGUT4AAIBRlA8AAGAU5QMAABhF+QAAAEZRPgAAgFGUDwAAYBTlAwAAGEX5AAAARlE+AACAUZQPAABgFOUDAAAYRfkAAABGUT4AAIBRlA8AAGAU5QMAABhF+QAAAEZRPgAAgFGUDwAAYBTlAwAAGEX5AAAARlE+AACAUZQPAABgFOUDAAAY5VX5WL9+vUaOHKnIyEhFRkYqLS1Nu3fv9qxvbm7WnDlzFB0drYiICOXk5Kiurs7noQEAQM/lVfkYPHiwnnjiCVVVVenw4cOaNGmS7rzzTv3lL3+RJC1YsEA7duzQ1q1bVVZWprNnz2ratGldEhwAAPRMQd5Mnjp1arvlZcuWaf369aqsrNTgwYNVVFSkzZs3a9KkSZKkjRs3asSIEaqsrFRqaqrvUgMAgB6rw9d8uFwuvfDCC2psbFRaWpqqqqrkdDqVnp7umZOcnKyEhARVVFT4JCwAAOj5vDryIUnHjh1TWlqampubFRERoZdfflnf+c53dOTIEQUHB2vAgAHt5ttsNtXW1n7l9hwOhxwOh2fZbrdLkpxOp5xOp7fxAPyP1tZWz2N32Zcu5egueS7pjv9WQE/jzb7jdfkYPny4jhw5ooaGBm3btk2zZs1SWVmZt5vxKCwsVEFBwWXjJSUlCgsL6/B2gb7uzAVJCtKBAwd0KsLfadorLS31d4R2uvO/FdBTNDU1XfXcALfb7e7Mi6Wnp2vYsGG666679IMf/ECff/55u6MfQ4YMUW5urhYsWHDF51/pyEd8fLw++eQTRUZGdiYa0Kf95axd2esrtf0XqbohrnvsS06nU6WlpcrIyJDFYvF3HI/u+G8F9DR2u13XXHONGhoavvH92+sjH1/W1tYmh8Oh0aNHy2KxaN++fcrJyZEkVVdX6/Tp00pLS/vK51utVlmt1svGLRZLt/rlBPQ0QUFBnsfuti91t/27O/9bAT2FN/uOV+UjPz9fkydPVkJCgs6fP6/Nmzfr9ddf1969exUVFaXZs2crLy9PAwcOVGRkpObNm6e0tDQ+6QIAADy8Kh8fffSR/umf/knnzp1TVFSURo4cqb179yojI0OStGrVKvXr1085OTlyOBzKysrSunXruiQ4AADombwqH0VFRV+7PiQkRGvXrtXatWs7FQoAAPRe3NsFAAAY1ekLTgF0XwFBdtXYq9UvpHt8frS1tVVnW8/qnc/e8Vzk2R3U2C8oIMju7xhAn9
F99n4APmcZcFAPvvm4v2NcZt2e7nctmGXADyTd4e8YQJ9A+QB6MWf9WK2Y8n81bFD3OfLxxwN/1K233dqtjny899EFzf/9e/6OAfQZ3WfvB+Bz7tZIJUYO13eio/wdRdIXXzJWE1SjEQNHdKvv02hrbpC79WN/xwD6DC44BQAARlE+AACAUZQPAABgFOUDAAAYRfkAAABGUT4AAIBRlA8AAGAU5QMAABhF+QAAAEZRPgAAgFGUDwAAYBTlAwAAGEX5AAAARlE+AACAUZQPAABgFOUDAAAYRfkAAABGUT4AAIBRlA8AAGAU5QMAABhF+QAAAEZRPgAAgFGUDwAAYBTlAwAAGEX5AAAARlE+AACAUZQPAABglFflo7CwUGPGjFH//v01aNAgZWdnq7q6ut2c5uZmzZkzR9HR0YqIiFBOTo7q6up8GhoAAPRcXpWPsrIyzZkzR5WVlSotLZXT6VRmZqYaGxs9cxYsWKAdO3Zo69atKisr09mzZzVt2jSfBwcAAD1TkDeT9+zZ025506ZNGjRokKqqqjR+/Hg1NDSoqKhImzdv1qRJkyRJGzdu1IgRI1RZWanU1FTfJQfwtS46XZKk4x82+DnJ/2q86NDhj6WYU58rPNTq7zgeJz+64O8IQJ/iVfn4soaGL36pDRw4UJJUVVUlp9Op9PR0z5zk5GQlJCSooqLiiuXD4XDI4XB4lu12uyTJ6XTK6XR2Jh7Qp7177ov9c3HxMT8n+bIgPX/ykL9DXJE10M3vHaCDvNl3Olw+2tralJubq1tvvVU33nijJKm2tlbBwcEaMGBAu7k2m021tbVX3E5hYaEKCgouGy8pKVFYWFhH4wFwSnf/nwANCnUruJtcWl53UXr+ZJD+X1KrbKH+TtOeNVB6+2CZ3vZ3EKCHampquuq5HS4fc+bM0fHjx3XgwIGObkKSlJ+fr7y8PM+y3W5XfHy8MjMzFRkZ2altA33dDH8H+JKjpz/T8ycP686JqbopYaC/4wDwoUtnLq5Gh8rH3LlztXPnTu3fv1+DBw/2jMfExKilpUX19fXtjn7U1dUpJibmituyWq2yWi8/92uxWGSxWDoSD0A3FRQU5Hlk/wZ6F2/2aa8Oxrrdbs2dO1cvv/yyXn31VSUmJrZbP3r0aFksFu3bt88zVl1drdOnTystLc2blwIAAL2UV0c+5syZo82bN+u//uu/1L9/f891HFFRUQoNDVVUVJRmz56tvLw8DRw4UJGRkZo3b57S0tL4pAsAAJDkZflYv369JOn2229vN75x40b95Cc/kSStWrVK/fr1U05OjhwOh7KysrRu3TqfhAUAAD2fV+XD7XZ/45yQkBCtXbtWa9eu7XAoAADQe3WTD+ABAIC+gvIBAACMonwAAACjKB8AAMAoygcAADCK8gEAAIyifAAAAKMoHwAAwCjKBwAAMIryAQAAjKJ8AAAAoygfAADAKMoHAAAwivIBAACMonwAAACjKB8AAMAoygcAADCK8gEAAIyifAAAAKMoHwAAwCjKBwAAMIryAQAAjKJ8AAAAoygfAADAKMoHAAAwivIBAACMonwAAACjKB8AAMAoygcAADCK8gEAAIyifAAAAKO8Lh/79+/X1KlTFRcXp4CAAG3fvr3derfbrSVLlig2NlahoaFKT0/XiRMnfJUXAAD0cF6Xj8bGRt10001au3btFdcvX75cTz/9tDZs2KCDBw8qPDxcWVlZam5u7nRYAADQ8wV5+4TJkydr8uTJV1zndru1evVqPfTQQ7rzzjslSc8995xsNpu2b9+uu+++u3NpAQBAj+d1+fg6NTU1qq2tVXp6umcsKipKY8eOVUVFxRXLh8PhkMPh8Czb7XZJktPplNPp9GU8AH7W2trqeWT/BnoXb/Zpn5aP2tpaSZLNZms3brPZPOu+rLCwUAUFBZeNl5SUKCwszJfxAPjZmQuSFKTKykp9eNzfaQD4UlNT01XP9Wn56Ij8/Hzl5eV5lu12u+Lj45WZma
nIyEg/JgPga0dPfyYdO6zU1FTdlDDQ33EA+NClMxdXw6flIyYmRpJUV1en2NhYz3hdXZ1GjRp1xedYrVZZrdbLxi0WiywWiy/jAfCzoKAgzyP7N9C7eLNP+/R7PhITExUTE6N9+/Z5xux2uw4ePKi0tDRfvhQAAOihvD7yceHCBZ08edKzXFNToyNHjmjgwIFKSEhQbm6uli5dquuuu06JiYl6+OGHFRcXp+zsbF/mBgAAPZTX5ePw4cOaOHGiZ/nS9RqzZs3Spk2btGjRIjU2Nuq+++5TfX29brvtNu3Zs0chISG+Sw0AAHosr8vH7bffLrfb/ZXrAwIC9Nhjj+mxxx7rVDAAANA7cW8XAABgFOUDAAAYRfkAAABGUT4AAIBRlA8AAGAU5QMAABhF+QAAAEZRPgAAgFGUDwAAYBTlAwAAGEX5AAAARlE+AACAUZQPAABgFOUDAAAYRfkAAABGUT4AAIBRlA8AAGAU5QMAABhF+QAAAEZRPgAAgFGUDwAAYBTlAwAAGEX5AAAARlE+AACAUZQPAABgFOUDAAAYRfkAAABGUT4AAIBRlA8AAGAU5QMAABhF+QAAAEZ1WflYu3athg4dqpCQEI0dO1ZvvvlmV70UAADoQYK6YqMvvvii8vLytGHDBo0dO1arV69WVlaWqqurNWjQoK54SQBdqKmpSX/96187vZ3qc/Vy1J7UO8dD1fbpgM4Hk5ScnKywsDCfbAuAGQFut9vt642OHTtWY8aM0TPPPCNJamtrU3x8vObNm6fFixd/7XPtdruioqLU0NCgyMhIX0cD0AFvvfWWRo8e7e8YV1RVVaWbb77Z3zGAPs+b92+fH/loaWlRVVWV8vPzPWP9+vVTenq6KioqLpvvcDjkcDg8y3a7XZLkdDrldDp9HQ9ABwwbNkwHDx7s9HYuXHRob/khZY0bo4hQqw+SfZGN3xWA/3mzH/q8fHzyySdyuVyy2Wztxm022xUP2xYWFqqgoOCy8ZKSEg6lAr3Q91OG6Xz9Zzpf75vtnTt3zjcbAtApTU1NVz23S6758EZ+fr7y8vI8y3a7XfHx8crMzOS0C9DLOJ1OlZaWKiMjQxaLxd9xAPjQpTMXV8Pn5eOaa65RYGCg6urq2o3X1dUpJibmsvlWq1VW6+WHXy0WC7+cgF6K/RvofbzZp33+Udvg4GCNHj1a+/bt84y1tbVp3759SktL8/XLAQCAHqZLTrvk5eVp1qxZuuWWW/S9731Pq1evVmNjo3760592xcsBAIAepEvKx1133aWPP/5YS5YsUW1trUaNGqU9e/ZcdhEqAADoe7rsgtO5c+dq7ty5XbV5AADQQ3FvFwAAYBTlAwAAGEX5AAAARlE+AACAUZQPAABgFOUDAAAYRfkAAABG+f3Gcl/mdrsleXeDGgA9g9PpVFNTk+x2O/d2AXqZS+/bl97Hv063Kx/nz5+XJMXHx/s5CQAA8Nb58+cVFRX1tXMC3FdTUQxqa2vT2bNn1b9/fwUEBPg7DgAfstvtio+P15kzZxQZGenvOAB8yO126/z584qLi1O/fl9/VUe3Kx8Aei+73a6oqCg1NDRQPoA+jAtOAQCAUZQPAABgFOUDgDFWq1WPPPKIrFarv6MA8COu+QAAAEZx5AMAABhF+QAAAEZRPgAAgFGUDwAAYBTlA4BPVFRUKDAwUFOmTPF3FADdHJ92AeAT9957ryIiIlRUVKTq6mrFxcX5OxKAboojHwA67cKFC3rxxRf1i1/8QlOmTNGmTZvarf/DH/6g6667TiEhIZo4caL+4z/+QwEBAaqvr/fMOXDggMaNG6fQ0FDFx8dr/vz5amxsNPsXAWAE5QNAp7300ktKTk7W8OHD9eMf/1j//u//7rmtdk1NjaZPn67s7GwdPXpUP//5z/Uv//Iv7Z7/3nvv6R/+4R+Uk5OjP//5z3rxxRd14MABzZ071x9/HQBdjNMuADrt1ltv1YwZM/SrX/1Kra2tio2N1datW3X77b
dr8eLFeuWVV3Ts2DHP/IceekjLli3T559/rgEDBujee+9VYGCgfvvb33rmHDhwQBMmTFBjY6NCQkL88dcC0EU48gGgU6qrq/Xmm29q5syZkqSgoCDdddddKioq8qwfM2ZMu+d873vfa7d89OhRbdq0SREREZ6frKwstbW1qaamxsxfBIAxQf4OAKBnKyoqUmtra7sLTN1ut6xWq5555pmr2saFCxf085//XPPnz79sXUJCgs+yAugeKB8AOqy1tVXPPfecVqxYoczMzHbrsrOztWXLFg0fPly7du1qt+7QoUPtlm+++Wa9/fbbSkpK6vLMAPyPaz4AdNj27dt111136aOPPlJUVFS7db/+9a/16quv6qWXXtLw4cO1YMECzZ49W0eOHNHChQv1wQcfqL6+XlFRUfrzn/+s1NRU/exnP9O9996r8PBwvf322yotLb3qoycAeg6u+QDQYUVFRUpPT7+seEhSTk6ODh8+rPPnz2vbtm0qLi7WyJEjtX79es+nXaxWqyRp5MiRKisr07vvvqtx48bpu9/9rpYsWcJ3hQC9FEc+ABi3bNkybdiwQWfOnPF3FAB+wDUfALrcunXrNGbMGEVHR+uPf/yjnnrqKb7DA+jDKB8AutyJEye0dOlSffbZZ0pISNDChQuVn5/v71gA/ITTLgAAwCguOAUAAEZRPgAAgFGUDwAAYBTlAwAAGEX5AAAARlE+AACAUZQPAABgFOUDAAAYRfkAAABG/X9Yms5FnRz1tgAAAABJRU5ErkJggg==",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"titanic.boxplot(column=\"Age\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Отсечение (clipping) значений признака «Возраст»: значения больше 65 лет заменяются на 65"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>AgeClip</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>34</th>\n",
|
||
" <td>Wheadon, Mr. Edward H</td>\n",
|
||
" <td>66.0</td>\n",
|
||
" <td>65.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>97</th>\n",
|
||
" <td>Goldschmidt, Mr. George B</td>\n",
|
||
" <td>71.0</td>\n",
|
||
" <td>65.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>117</th>\n",
|
||
" <td>Connors, Mr. Patrick</td>\n",
|
||
" <td>70.5</td>\n",
|
||
" <td>65.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>494</th>\n",
|
||
" <td>Artagaveytia, Mr. Ramon</td>\n",
|
||
" <td>71.0</td>\n",
|
||
" <td>65.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>631</th>\n",
|
||
" <td>Barkworth, Mr. Algernon Henry Wilson</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>65.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>673</th>\n",
|
||
" <td>Mitchell, Mr. Henry Michael</td>\n",
|
||
" <td>70.0</td>\n",
|
||
" <td>65.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>746</th>\n",
|
||
" <td>Crosby, Capt. Edward Gifford</td>\n",
|
||
" <td>70.0</td>\n",
|
||
" <td>65.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>852</th>\n",
|
||
" <td>Svensson, Mr. Johan</td>\n",
|
||
" <td>74.0</td>\n",
|
||
" <td>65.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Name Age AgeClip\n",
|
||
"34 Wheadon, Mr. Edward H 66.0 65.0\n",
|
||
"97 Goldschmidt, Mr. George B 71.0 65.0\n",
|
||
"117 Connors, Mr. Patrick 70.5 65.0\n",
|
||
"494 Artagaveytia, Mr. Ramon 71.0 65.0\n",
|
||
"631 Barkworth, Mr. Algernon Henry Wilson 80.0 65.0\n",
|
||
"673 Mitchell, Mr. Henry Michael 70.0 65.0\n",
|
||
"746 Crosby, Capt. Edward Gifford 70.0 65.0\n",
|
||
"852 Svensson, Mr. Johan 74.0 65.0"
|
||
]
|
||
},
|
||
"execution_count": 23,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"titanic_norm = titanic.copy()\n",
|
||
"\n",
|
||
"titanic_norm[\"AgeClip\"] = titanic[\"Age\"].clip(0, 65);\n",
|
||
"\n",
|
||
"titanic_norm[titanic_norm[\"Age\"] > 65][[\"Name\", \"Age\", \"AgeClip\"]]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
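||
"Винсоризация признака «Возраст»: верхние 5 % значений заменяются граничным значением. Пропуски предварительно заполняются средним, поэтому граница винсоризации может не совпадать с выведенным 95-м перцентилем исходного столбца"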
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"56.0\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>AgeWinsorize</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>34</th>\n",
|
||
" <td>Wheadon, Mr. Edward H</td>\n",
|
||
" <td>66.0</td>\n",
|
||
" <td>54.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>97</th>\n",
|
||
" <td>Goldschmidt, Mr. George B</td>\n",
|
||
" <td>71.0</td>\n",
|
||
" <td>54.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>117</th>\n",
|
||
" <td>Connors, Mr. Patrick</td>\n",
|
||
" <td>70.5</td>\n",
|
||
" <td>54.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>494</th>\n",
|
||
" <td>Artagaveytia, Mr. Ramon</td>\n",
|
||
" <td>71.0</td>\n",
|
||
" <td>54.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>631</th>\n",
|
||
" <td>Barkworth, Mr. Algernon Henry Wilson</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>54.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>673</th>\n",
|
||
" <td>Mitchell, Mr. Henry Michael</td>\n",
|
||
" <td>70.0</td>\n",
|
||
" <td>54.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>746</th>\n",
|
||
" <td>Crosby, Capt. Edward Gifford</td>\n",
|
||
" <td>70.0</td>\n",
|
||
" <td>54.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>852</th>\n",
|
||
" <td>Svensson, Mr. Johan</td>\n",
|
||
" <td>74.0</td>\n",
|
||
" <td>54.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Name Age AgeWinsorize\n",
|
||
"34 Wheadon, Mr. Edward H 66.0 54.0\n",
|
||
"97 Goldschmidt, Mr. George B 71.0 54.0\n",
|
||
"117 Connors, Mr. Patrick 70.5 54.0\n",
|
||
"494 Artagaveytia, Mr. Ramon 71.0 54.0\n",
|
||
"631 Barkworth, Mr. Algernon Henry Wilson 80.0 54.0\n",
|
||
"673 Mitchell, Mr. Henry Michael 70.0 54.0\n",
|
||
"746 Crosby, Capt. Edward Gifford 70.0 54.0\n",
|
||
"852 Svensson, Mr. Johan 74.0 54.0"
|
||
]
|
||
},
|
||
"execution_count": 24,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from scipy.stats.mstats import winsorize\n",
|
||
"\n",
|
||
"print(titanic_norm[\"Age\"].quantile(q=0.95))\n",
|
||
"\n",
|
||
"titanic_norm[\"AgeWinsorize\"] = winsorize(\n",
|
||
" titanic_norm[\"Age\"].fillna(titanic_norm[\"Age\"].mean()), (0, 0.05), inplace=False\n",
|
||
")\n",
|
||
"\n",
|
||
"titanic_norm[titanic_norm[\"Age\"] > 65][[\"Name\", \"Age\", \"AgeWinsorize\"]]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Нормализация значений"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>AgeNorm</th>\n",
|
||
" <th>AgeClipNorm</th>\n",
|
||
" <th>AgeWinsorizeNorm</th>\n",
|
||
" <th>AgeWinsorizeNorm2</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>Braund, Mr. Owen Harris</td>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>0.271174</td>\n",
|
||
" <td>0.334159</td>\n",
|
||
" <td>0.402762</td>\n",
|
||
" <td>-0.194476</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>0.472229</td>\n",
|
||
" <td>0.581914</td>\n",
|
||
" <td>0.701381</td>\n",
|
||
" <td>0.402762</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>Heikkinen, Miss. Laina</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>0.321438</td>\n",
|
||
" <td>0.396098</td>\n",
|
||
" <td>0.477417</td>\n",
|
||
" <td>-0.045166</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>0.434531</td>\n",
|
||
" <td>0.535460</td>\n",
|
||
" <td>0.645390</td>\n",
|
||
" <td>0.290780</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Allen, Mr. William Henry</td>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>0.434531</td>\n",
|
||
" <td>0.535460</td>\n",
|
||
" <td>0.645390</td>\n",
|
||
" <td>0.290780</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>Moran, Mr. James</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.546456</td>\n",
|
||
" <td>0.092912</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>McCarthy, Mr. Timothy J</td>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>0.673285</td>\n",
|
||
" <td>0.829669</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>Palsson, Master. Gosta Leonard</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0.019854</td>\n",
|
||
" <td>0.024466</td>\n",
|
||
" <td>0.029489</td>\n",
|
||
" <td>-0.941023</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)</td>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>0.334004</td>\n",
|
||
" <td>0.411583</td>\n",
|
||
" <td>0.496081</td>\n",
|
||
" <td>-0.007839</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>Nasser, Mrs. Nicholas (Adele Achem)</td>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>0.170646</td>\n",
|
||
" <td>0.210282</td>\n",
|
||
" <td>0.253453</td>\n",
|
||
" <td>-0.493094</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>Sandstrom, Miss. Marguerite Rut</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>0.044986</td>\n",
|
||
" <td>0.055435</td>\n",
|
||
" <td>0.066816</td>\n",
|
||
" <td>-0.866368</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>Bonnell, Miss. Elizabeth</td>\n",
|
||
" <td>58.0</td>\n",
|
||
" <td>0.723549</td>\n",
|
||
" <td>0.891607</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>Saundercock, Mr. William Henry</td>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>0.246042</td>\n",
|
||
" <td>0.303190</td>\n",
|
||
" <td>0.365435</td>\n",
|
||
" <td>-0.269130</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>Andersson, Mr. Anders Johan</td>\n",
|
||
" <td>39.0</td>\n",
|
||
" <td>0.484795</td>\n",
|
||
" <td>0.597399</td>\n",
|
||
" <td>0.720045</td>\n",
|
||
" <td>0.440090</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>Vestrom, Miss. Hulda Amanda Adolfina</td>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>0.170646</td>\n",
|
||
" <td>0.210282</td>\n",
|
||
" <td>0.253453</td>\n",
|
||
" <td>-0.493094</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>Hewlett, Mrs. (Mary D Kingcome)</td>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>0.685851</td>\n",
|
||
" <td>0.845153</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>Rice, Master. Eugene</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0.019854</td>\n",
|
||
" <td>0.024466</td>\n",
|
||
" <td>0.029489</td>\n",
|
||
" <td>-0.941023</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>Williams, Mr. Charles Eugene</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.546456</td>\n",
|
||
" <td>0.092912</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>Vander Planke, Mrs. Julius (Emelia Maria Vande...</td>\n",
|
||
" <td>31.0</td>\n",
|
||
" <td>0.384267</td>\n",
|
||
" <td>0.473521</td>\n",
|
||
" <td>0.570735</td>\n",
|
||
" <td>0.141471</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>Masselmani, Mrs. Fatima</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.546456</td>\n",
|
||
" <td>0.092912</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Name Age AgeNorm \\\n",
|
||
"1 Braund, Mr. Owen Harris 22.0 0.271174 \n",
|
||
"2 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 0.472229 \n",
|
||
"3 Heikkinen, Miss. Laina 26.0 0.321438 \n",
|
||
"4 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 0.434531 \n",
|
||
"5 Allen, Mr. William Henry 35.0 0.434531 \n",
|
||
"6 Moran, Mr. James NaN NaN \n",
|
||
"7 McCarthy, Mr. Timothy J 54.0 0.673285 \n",
|
||
"8 Palsson, Master. Gosta Leonard 2.0 0.019854 \n",
|
||
"9 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 27.0 0.334004 \n",
|
||
"10 Nasser, Mrs. Nicholas (Adele Achem) 14.0 0.170646 \n",
|
||
"11 Sandstrom, Miss. Marguerite Rut 4.0 0.044986 \n",
|
||
"12 Bonnell, Miss. Elizabeth 58.0 0.723549 \n",
|
||
"13 Saundercock, Mr. William Henry 20.0 0.246042 \n",
|
||
"14 Andersson, Mr. Anders Johan 39.0 0.484795 \n",
|
||
"15 Vestrom, Miss. Hulda Amanda Adolfina 14.0 0.170646 \n",
|
||
"16 Hewlett, Mrs. (Mary D Kingcome) 55.0 0.685851 \n",
|
||
"17 Rice, Master. Eugene 2.0 0.019854 \n",
|
||
"18 Williams, Mr. Charles Eugene NaN NaN \n",
|
||
"19 Vander Planke, Mrs. Julius (Emelia Maria Vande... 31.0 0.384267 \n",
|
||
"20 Masselmani, Mrs. Fatima NaN NaN \n",
|
||
"\n",
|
||
" AgeClipNorm AgeWinsorizeNorm AgeWinsorizeNorm2 \n",
|
||
"1 0.334159 0.402762 -0.194476 \n",
|
||
"2 0.581914 0.701381 0.402762 \n",
|
||
"3 0.396098 0.477417 -0.045166 \n",
|
||
"4 0.535460 0.645390 0.290780 \n",
|
||
"5 0.535460 0.645390 0.290780 \n",
|
||
"6 NaN 0.546456 0.092912 \n",
|
||
"7 0.829669 1.000000 1.000000 \n",
|
||
"8 0.024466 0.029489 -0.941023 \n",
|
||
"9 0.411583 0.496081 -0.007839 \n",
|
||
"10 0.210282 0.253453 -0.493094 \n",
|
||
"11 0.055435 0.066816 -0.866368 \n",
|
||
"12 0.891607 1.000000 1.000000 \n",
|
||
"13 0.303190 0.365435 -0.269130 \n",
|
||
"14 0.597399 0.720045 0.440090 \n",
|
||
"15 0.210282 0.253453 -0.493094 \n",
|
||
"16 0.845153 1.000000 1.000000 \n",
|
||
"17 0.024466 0.029489 -0.941023 \n",
|
||
"18 NaN 0.546456 0.092912 \n",
|
||
"19 0.473521 0.570735 0.141471 \n",
|
||
"20 NaN 0.546456 0.092912 "
|
||
]
|
||
},
|
||
"execution_count": 25,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn import preprocessing\n",
|
||
"\n",
|
||
"min_max_scaler = preprocessing.MinMaxScaler()\n",
|
||
"\n",
|
||
"min_max_scaler_2 = preprocessing.MinMaxScaler(feature_range=(-1, 1))\n",
|
||
"\n",
|
||
"titanic_norm[\"AgeNorm\"] = min_max_scaler.fit_transform(\n",
|
||
" titanic_norm[\"Age\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(titanic_norm[\"Age\"].shape)\n",
|
||
"\n",
|
||
"titanic_norm[\"AgeClipNorm\"] = min_max_scaler.fit_transform(\n",
|
||
" titanic_norm[\"AgeClip\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(titanic_norm[\"Age\"].shape)\n",
|
||
"\n",
|
||
"titanic_norm[\"AgeWinsorizeNorm\"] = min_max_scaler.fit_transform(\n",
|
||
" titanic_norm[\"AgeWinsorize\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(titanic_norm[\"Age\"].shape)\n",
|
||
"\n",
|
||
"titanic_norm[\"AgeWinsorizeNorm2\"] = min_max_scaler_2.fit_transform(\n",
|
||
" titanic_norm[\"AgeWinsorize\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(titanic_norm[\"Age\"].shape)\n",
|
||
"\n",
|
||
"titanic_norm[\n",
|
||
" [\"Name\", \"Age\", \"AgeNorm\", \"AgeClipNorm\", \"AgeWinsorizeNorm\", \"AgeWinsorizeNorm2\"]\n",
|
||
"].head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Стандартизация значений"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 26,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>AgeStand</th>\n",
|
||
" <th>AgeClipStand</th>\n",
|
||
" <th>AgeWinsorizeStand</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>Braund, Mr. Owen Harris</td>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>-0.530377</td>\n",
|
||
" <td>-0.532745</td>\n",
|
||
" <td>-0.606602</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>0.571831</td>\n",
|
||
" <td>0.585060</td>\n",
|
||
" <td>0.718863</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>Heikkinen, Miss. Laina</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>-0.254825</td>\n",
|
||
" <td>-0.253294</td>\n",
|
||
" <td>-0.275236</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>0.365167</td>\n",
|
||
" <td>0.375472</td>\n",
|
||
" <td>0.470339</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Allen, Mr. William Henry</td>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>0.365167</td>\n",
|
||
" <td>0.375472</td>\n",
|
||
" <td>0.470339</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>Moran, Mr. James</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.031205</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>McCarthy, Mr. Timothy J</td>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>1.674039</td>\n",
|
||
" <td>1.702866</td>\n",
|
||
" <td>2.044329</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>Palsson, Master. Gosta Leonard</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>-1.908136</td>\n",
|
||
" <td>-1.930003</td>\n",
|
||
" <td>-2.263435</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)</td>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>-0.185937</td>\n",
|
||
" <td>-0.183431</td>\n",
|
||
" <td>-0.192394</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>Nasser, Mrs. Nicholas (Adele Achem)</td>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>-1.081480</td>\n",
|
||
" <td>-1.091648</td>\n",
|
||
" <td>-1.269335</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>Sandstrom, Miss. Marguerite Rut</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>-1.770360</td>\n",
|
||
" <td>-1.790277</td>\n",
|
||
" <td>-2.097751</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>Bonnell, Miss. Elizabeth</td>\n",
|
||
" <td>58.0</td>\n",
|
||
" <td>1.949591</td>\n",
|
||
" <td>1.982317</td>\n",
|
||
" <td>2.044329</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>Saundercock, Mr. William Henry</td>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>-0.668153</td>\n",
|
||
" <td>-0.672471</td>\n",
|
||
" <td>-0.772286</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>Andersson, Mr. Anders Johan</td>\n",
|
||
" <td>39.0</td>\n",
|
||
" <td>0.640719</td>\n",
|
||
" <td>0.654923</td>\n",
|
||
" <td>0.801705</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>Vestrom, Miss. Hulda Amanda Adolfina</td>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>-1.081480</td>\n",
|
||
" <td>-1.091648</td>\n",
|
||
" <td>-1.269335</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>Hewlett, Mrs. (Mary D Kingcome)</td>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>1.742927</td>\n",
|
||
" <td>1.772729</td>\n",
|
||
" <td>2.044329</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>Rice, Master. Eugene</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>-1.908136</td>\n",
|
||
" <td>-1.930003</td>\n",
|
||
" <td>-2.263435</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>Williams, Mr. Charles Eugene</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.031205</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>Vander Planke, Mrs. Julius (Emelia Maria Vande...</td>\n",
|
||
" <td>31.0</td>\n",
|
||
" <td>0.089615</td>\n",
|
||
" <td>0.096020</td>\n",
|
||
" <td>0.138972</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>Masselmani, Mrs. Fatima</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.031205</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Name Age AgeStand \\\n",
|
||
"1 Braund, Mr. Owen Harris 22.0 -0.530377 \n",
|
||
"2 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 0.571831 \n",
|
||
"3 Heikkinen, Miss. Laina 26.0 -0.254825 \n",
|
||
"4 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 0.365167 \n",
|
||
"5 Allen, Mr. William Henry 35.0 0.365167 \n",
|
||
"6 Moran, Mr. James NaN NaN \n",
|
||
"7 McCarthy, Mr. Timothy J 54.0 1.674039 \n",
|
||
"8 Palsson, Master. Gosta Leonard 2.0 -1.908136 \n",
|
||
"9 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 27.0 -0.185937 \n",
|
||
"10 Nasser, Mrs. Nicholas (Adele Achem) 14.0 -1.081480 \n",
|
||
"11 Sandstrom, Miss. Marguerite Rut 4.0 -1.770360 \n",
|
||
"12 Bonnell, Miss. Elizabeth 58.0 1.949591 \n",
|
||
"13 Saundercock, Mr. William Henry 20.0 -0.668153 \n",
|
||
"14 Andersson, Mr. Anders Johan 39.0 0.640719 \n",
|
||
"15 Vestrom, Miss. Hulda Amanda Adolfina 14.0 -1.081480 \n",
|
||
"16 Hewlett, Mrs. (Mary D Kingcome) 55.0 1.742927 \n",
|
||
"17 Rice, Master. Eugene 2.0 -1.908136 \n",
|
||
"18 Williams, Mr. Charles Eugene NaN NaN \n",
|
||
"19 Vander Planke, Mrs. Julius (Emelia Maria Vande... 31.0 0.089615 \n",
|
||
"20 Masselmani, Mrs. Fatima NaN NaN \n",
|
||
"\n",
|
||
" AgeClipStand AgeWinsorizeStand \n",
|
||
"1 -0.532745 -0.606602 \n",
|
||
"2 0.585060 0.718863 \n",
|
||
"3 -0.253294 -0.275236 \n",
|
||
"4 0.375472 0.470339 \n",
|
||
"5 0.375472 0.470339 \n",
|
||
"6 NaN 0.031205 \n",
|
||
"7 1.702866 2.044329 \n",
|
||
"8 -1.930003 -2.263435 \n",
|
||
"9 -0.183431 -0.192394 \n",
|
||
"10 -1.091648 -1.269335 \n",
|
||
"11 -1.790277 -2.097751 \n",
|
||
"12 1.982317 2.044329 \n",
|
||
"13 -0.672471 -0.772286 \n",
|
||
"14 0.654923 0.801705 \n",
|
||
"15 -1.091648 -1.269335 \n",
|
||
"16 1.772729 2.044329 \n",
|
||
"17 -1.930003 -2.263435 \n",
|
||
"18 NaN 0.031205 \n",
|
||
"19 0.096020 0.138972 \n",
|
||
"20 NaN 0.031205 "
|
||
]
|
||
},
|
||
"execution_count": 26,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn import preprocessing\n",
|
||
"\n",
|
||
"stndart_scaler = preprocessing.StandardScaler()\n",
|
||
"\n",
|
||
"titanic_norm[\"AgeStand\"] = stndart_scaler.fit_transform(\n",
|
||
" titanic_norm[\"Age\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(titanic_norm[\"Age\"].shape)\n",
|
||
"\n",
|
||
"titanic_norm[\"AgeClipStand\"] = stndart_scaler.fit_transform(\n",
|
||
" titanic_norm[\"AgeClip\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(titanic_norm[\"Age\"].shape)\n",
|
||
"\n",
|
||
"titanic_norm[\"AgeWinsorizeStand\"] = stndart_scaler.fit_transform(\n",
|
||
" titanic_norm[\"AgeWinsorize\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(titanic_norm[\"Age\"].shape)\n",
|
||
"\n",
|
||
"titanic_norm[[\"Name\", \"Age\", \"AgeStand\", \"AgeClipStand\", \"AgeWinsorizeStand\"]].head(20)"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": ".venv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.7"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|