3956 lines
134 KiB
Plaintext
3956 lines
134 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Унитарное кодирование\n",
|
||
"\n",
|
||
"Преобразование категориального признака в несколько бинарных признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Загрузка набора данных Titanic"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Survived</th>\n",
|
||
" <th>Pclass</th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Sex</th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>SibSp</th>\n",
|
||
" <th>Parch</th>\n",
|
||
" <th>Ticket</th>\n",
|
||
" <th>Fare</th>\n",
|
||
" <th>Cabin</th>\n",
|
||
" <th>Embarked</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>PassengerId</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Braund, Mr. Owen Harris</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>A/5 21171</td>\n",
|
||
" <td>7.2500</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>PC 17599</td>\n",
|
||
" <td>71.2833</td>\n",
|
||
" <td>C85</td>\n",
|
||
" <td>C</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Heikkinen, Miss. Laina</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>STON/O2. 3101282</td>\n",
|
||
" <td>7.9250</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>113803</td>\n",
|
||
" <td>53.1000</td>\n",
|
||
" <td>C123</td>\n",
|
||
" <td>S</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Allen, Mr. William Henry</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>373450</td>\n",
|
||
" <td>8.0500</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>887</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>Montvila, Rev. Juozas</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>211536</td>\n",
|
||
" <td>13.0000</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>888</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Graham, Miss. Margaret Edith</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>19.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>112053</td>\n",
|
||
" <td>30.0000</td>\n",
|
||
" <td>B42</td>\n",
|
||
" <td>S</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>889</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>W./C. 6607</td>\n",
|
||
" <td>23.4500</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>890</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Behr, Mr. Karl Howell</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>111369</td>\n",
|
||
" <td>30.0000</td>\n",
|
||
" <td>C148</td>\n",
|
||
" <td>C</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>891</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Dooley, Mr. Patrick</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>32.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>370376</td>\n",
|
||
" <td>7.7500</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Q</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>891 rows × 11 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Survived Pclass \\\n",
|
||
"PassengerId \n",
|
||
"1 0 3 \n",
|
||
"2 1 1 \n",
|
||
"3 1 3 \n",
|
||
"4 1 1 \n",
|
||
"5 0 3 \n",
|
||
"... ... ... \n",
|
||
"887 0 2 \n",
|
||
"888 1 1 \n",
|
||
"889 0 3 \n",
|
||
"890 1 1 \n",
|
||
"891 0 3 \n",
|
||
"\n",
|
||
" Name Sex Age \\\n",
|
||
"PassengerId \n",
|
||
"1 Braund, Mr. Owen Harris male 22.0 \n",
|
||
"2 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 \n",
|
||
"3 Heikkinen, Miss. Laina female 26.0 \n",
|
||
"4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 \n",
|
||
"5 Allen, Mr. William Henry male 35.0 \n",
|
||
"... ... ... ... \n",
|
||
"887 Montvila, Rev. Juozas male 27.0 \n",
|
||
"888 Graham, Miss. Margaret Edith female 19.0 \n",
|
||
"889 Johnston, Miss. Catherine Helen \"Carrie\" female NaN \n",
|
||
"890 Behr, Mr. Karl Howell male 26.0 \n",
|
||
"891 Dooley, Mr. Patrick male 32.0 \n",
|
||
"\n",
|
||
" SibSp Parch Ticket Fare Cabin Embarked \n",
|
||
"PassengerId \n",
|
||
"1 1 0 A/5 21171 7.2500 NaN S \n",
|
||
"2 1 0 PC 17599 71.2833 C85 C \n",
|
||
"3 0 0 STON/O2. 3101282 7.9250 NaN S \n",
|
||
"4 1 0 113803 53.1000 C123 S \n",
|
||
"5 0 0 373450 8.0500 NaN S \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"887 0 0 211536 13.0000 NaN S \n",
|
||
"888 0 0 112053 30.0000 B42 S \n",
|
||
"889 1 2 W./C. 6607 23.4500 NaN S \n",
|
||
"890 0 0 111369 30.0000 C148 C \n",
|
||
"891 0 0 370376 7.7500 NaN Q \n",
|
||
"\n",
|
||
"[891 rows x 11 columns]"
|
||
]
|
||
},
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"titanic = pd.read_csv(\"data/titanic.csv\", index_col=\"PassengerId\")\n",
|
||
"\n",
|
||
"titanic"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Унитарное кодирование признаков Пол (Sex) и Порт посадки (Embarked)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Кодирование"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Embarked_Q</th>\n",
|
||
" <th>Embarked_S</th>\n",
|
||
" <th>Embarked_nan</th>\n",
|
||
" <th>Sex_male</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>886</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>887</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>888</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>889</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>890</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>891 rows × 4 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Embarked_Q Embarked_S Embarked_nan Sex_male\n",
|
||
"0 0.0 1.0 0.0 1.0\n",
|
||
"1 0.0 0.0 0.0 0.0\n",
|
||
"2 0.0 1.0 0.0 0.0\n",
|
||
"3 0.0 1.0 0.0 0.0\n",
|
||
"4 0.0 1.0 0.0 1.0\n",
|
||
".. ... ... ... ...\n",
|
||
"886 0.0 1.0 0.0 1.0\n",
|
||
"887 0.0 1.0 0.0 0.0\n",
|
||
"888 0.0 1.0 0.0 0.0\n",
|
||
"889 0.0 0.0 0.0 1.0\n",
|
||
"890 1.0 0.0 0.0 1.0\n",
|
||
"\n",
|
||
"[891 rows x 4 columns]"
|
||
]
|
||
},
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"import numpy as np\n",
|
||
"\n",
|
||
"encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
|
||
"\n",
|
||
"encoded_values = encoder.fit_transform(titanic[[\"Embarked\", \"Sex\"]])\n",
|
||
"\n",
|
||
"encoded_columns = encoder.get_feature_names_out([\"Embarked\", \"Sex\"])\n",
|
||
"\n",
|
||
"# Index the encoded frame by titanic's own index (PassengerId) so that a column-wise\n",
"# pd.concat with titanic aligns row-for-row instead of shifting rows and adding NaN rows.\n",
"encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns, index=titanic.index)\n",
|
||
"\n",
|
||
"encoded_values_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Добавление признаков в исходный DataFrame"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Survived</th>\n",
|
||
" <th>Pclass</th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Sex</th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>SibSp</th>\n",
|
||
" <th>Parch</th>\n",
|
||
" <th>Ticket</th>\n",
|
||
" <th>Fare</th>\n",
|
||
" <th>Cabin</th>\n",
|
||
" <th>Embarked</th>\n",
|
||
" <th>Embarked_Q</th>\n",
|
||
" <th>Embarked_S</th>\n",
|
||
" <th>Embarked_nan</th>\n",
|
||
" <th>Sex_male</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>Braund, Mr. Owen Harris</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>A/5 21171</td>\n",
|
||
" <td>7.2500</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>PC 17599</td>\n",
|
||
" <td>71.2833</td>\n",
|
||
" <td>C85</td>\n",
|
||
" <td>C</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>Heikkinen, Miss. Laina</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>STON/O2. 3101282</td>\n",
|
||
" <td>7.9250</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>113803</td>\n",
|
||
" <td>53.1000</td>\n",
|
||
" <td>C123</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>Allen, Mr. William Henry</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>373450</td>\n",
|
||
" <td>8.0500</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>888</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Graham, Miss. Margaret Edith</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>19.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>112053</td>\n",
|
||
" <td>30.0000</td>\n",
|
||
" <td>B42</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>889</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>W./C. 6607</td>\n",
|
||
" <td>23.4500</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>890</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Behr, Mr. Karl Howell</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>111369</td>\n",
|
||
" <td>30.0000</td>\n",
|
||
" <td>C148</td>\n",
|
||
" <td>C</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>891</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>Dooley, Mr. Patrick</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>32.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>370376</td>\n",
|
||
" <td>7.7500</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Q</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>892 rows × 15 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Survived Pclass Name \\\n",
|
||
"1 0.0 3.0 Braund, Mr. Owen Harris \n",
|
||
"2 1.0 1.0 Cumings, Mrs. John Bradley (Florence Briggs Th... \n",
|
||
"3 1.0 3.0 Heikkinen, Miss. Laina \n",
|
||
"4 1.0 1.0 Futrelle, Mrs. Jacques Heath (Lily May Peel) \n",
|
||
"5 0.0 3.0 Allen, Mr. William Henry \n",
|
||
".. ... ... ... \n",
|
||
"888 1.0 1.0 Graham, Miss. Margaret Edith \n",
|
||
"889 0.0 3.0 Johnston, Miss. Catherine Helen \"Carrie\" \n",
|
||
"890 1.0 1.0 Behr, Mr. Karl Howell \n",
|
||
"891 0.0 3.0 Dooley, Mr. Patrick \n",
|
||
"0 NaN NaN NaN \n",
|
||
"\n",
|
||
" Sex Age SibSp Parch Ticket Fare Cabin Embarked \\\n",
|
||
"1 male 22.0 1.0 0.0 A/5 21171 7.2500 NaN S \n",
|
||
"2 female 38.0 1.0 0.0 PC 17599 71.2833 C85 C \n",
|
||
"3 female 26.0 0.0 0.0 STON/O2. 3101282 7.9250 NaN S \n",
|
||
"4 female 35.0 1.0 0.0 113803 53.1000 C123 S \n",
|
||
"5 male 35.0 0.0 0.0 373450 8.0500 NaN S \n",
|
||
".. ... ... ... ... ... ... ... ... \n",
|
||
"888 female 19.0 0.0 0.0 112053 30.0000 B42 S \n",
|
||
"889 female NaN 1.0 2.0 W./C. 6607 23.4500 NaN S \n",
|
||
"890 male 26.0 0.0 0.0 111369 30.0000 C148 C \n",
|
||
"891 male 32.0 0.0 0.0 370376 7.7500 NaN Q \n",
|
||
"0 NaN NaN NaN NaN NaN NaN NaN NaN \n",
|
||
"\n",
|
||
" Embarked_Q Embarked_S Embarked_nan Sex_male \n",
|
||
"1 0.0 0.0 0.0 0.0 \n",
|
||
"2 0.0 1.0 0.0 0.0 \n",
|
||
"3 0.0 1.0 0.0 0.0 \n",
|
||
"4 0.0 1.0 0.0 1.0 \n",
|
||
"5 1.0 0.0 0.0 1.0 \n",
|
||
".. ... ... ... ... \n",
|
||
"888 0.0 1.0 0.0 0.0 \n",
|
||
"889 0.0 0.0 0.0 1.0 \n",
|
||
"890 1.0 0.0 0.0 1.0 \n",
|
||
"891 NaN NaN NaN NaN \n",
|
||
"0 0.0 1.0 0.0 1.0 \n",
|
||
"\n",
|
||
"[892 rows x 15 columns]"
|
||
]
|
||
},
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Drop any one-hot columns appended by a previous run of this cell so that re-running it\n",
"# does not duplicate them; errors=\"ignore\" makes the drop a no-op on the first run.\n",
"titanic = pd.concat([titanic.drop(columns=encoded_values_df.columns, errors=\"ignore\"), encoded_values_df], axis=1)\n",
|
||
"\n",
|
||
"titanic"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Дискретизация признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Равномерное разделение данных на 3 группы (интервалы равной ширины)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"labels = [\"young\", \"middle-aged\", \"old\"]\n",
|
||
"num_bins = 3"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(array([ 0.42 , 26.94666667, 53.47333333, 80. ]),\n",
|
||
" array([319, 523, 50]))"
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"hist1, bins1 = np.histogram(titanic[\"Age\"].fillna(titanic[\"Age\"].median()), bins=num_bins)\n",
|
||
"bins1, hist1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>Age</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>(0.42, 26.947]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>(26.947, 53.473]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>(0.42, 26.947]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>(26.947, 53.473]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>(26.947, 53.473]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>(53.473, 80.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>(0.42, 26.947]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>(26.947, 53.473]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>(0.42, 26.947]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>(0.42, 26.947]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>58.0</td>\n",
|
||
" <td>(53.473, 80.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>(0.42, 26.947]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>39.0</td>\n",
|
||
" <td>(26.947, 53.473]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>(0.42, 26.947]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>(53.473, 80.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>(0.42, 26.947]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>31.0</td>\n",
|
||
" <td>(26.947, 53.473]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Age Age\n",
|
||
"1 22.0 (0.42, 26.947]\n",
|
||
"2 38.0 (26.947, 53.473]\n",
|
||
"3 26.0 (0.42, 26.947]\n",
|
||
"4 35.0 (26.947, 53.473]\n",
|
||
"5 35.0 (26.947, 53.473]\n",
|
||
"6 NaN NaN\n",
|
||
"7 54.0 (53.473, 80.0]\n",
|
||
"8 2.0 (0.42, 26.947]\n",
|
||
"9 27.0 (26.947, 53.473]\n",
|
||
"10 14.0 (0.42, 26.947]\n",
|
||
"11 4.0 (0.42, 26.947]\n",
|
||
"12 58.0 (53.473, 80.0]\n",
|
||
"13 20.0 (0.42, 26.947]\n",
|
||
"14 39.0 (26.947, 53.473]\n",
|
||
"15 14.0 (0.42, 26.947]\n",
|
||
"16 55.0 (53.473, 80.0]\n",
|
||
"17 2.0 (0.42, 26.947]\n",
|
||
"18 NaN NaN\n",
|
||
"19 31.0 (26.947, 53.473]\n",
|
||
"20 NaN NaN"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# include_lowest=True makes the first interval left-inclusive: bins1[0] is the minimum age,\n",
"# and pd.cut's default left-open intervals would otherwise map that passenger to NaN.\n",
"pd.concat([titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins1), include_lowest=True)], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>Age</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>58.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>39.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>31.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Age Age\n",
|
||
"1 22.0 young\n",
|
||
"2 38.0 middle-aged\n",
|
||
"3 26.0 young\n",
|
||
"4 35.0 middle-aged\n",
|
||
"5 35.0 middle-aged\n",
|
||
"6 NaN NaN\n",
|
||
"7 54.0 old\n",
|
||
"8 2.0 young\n",
|
||
"9 27.0 middle-aged\n",
|
||
"10 14.0 young\n",
|
||
"11 4.0 young\n",
|
||
"12 58.0 old\n",
|
||
"13 20.0 young\n",
|
||
"14 39.0 middle-aged\n",
|
||
"15 14.0 young\n",
|
||
"16 55.0 old\n",
|
||
"17 2.0 young\n",
|
||
"18 NaN NaN\n",
|
||
"19 31.0 middle-aged\n",
|
||
"20 NaN NaN"
|
||
]
|
||
},
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# include_lowest=True makes the first interval left-inclusive: bins1[0] is the minimum age,\n",
"# and pd.cut's default left-open intervals would otherwise leave that passenger unlabeled (NaN).\n",
"pd.concat([titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins1), labels=labels, include_lowest=True)], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Равномерное разделение данных на 3 группы с установкой собственных границ диапазона значений (от 0 до 100)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(array([ 0. , 33.33333333, 66.66666667, 100. ]),\n",
|
||
" array([641, 244, 7]))"
|
||
]
|
||
},
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"bins2 = np.linspace(0, 100, 4)\n",
|
||
"tmp_bins2 = np.digitize(titanic[\"Age\"].fillna(titanic[\"Age\"].median()), bins2)\n",
|
||
"hist2 = np.bincount(tmp_bins2 - 1)\n",
|
||
"bins2, hist2"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>Age</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>(0.0, 33.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>(33.333, 66.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>(0.0, 33.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>(33.333, 66.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>(33.333, 66.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>(33.333, 66.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>(0.0, 33.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>(0.0, 33.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>(0.0, 33.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>(0.0, 33.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>58.0</td>\n",
|
||
" <td>(33.333, 66.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>(0.0, 33.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>39.0</td>\n",
|
||
" <td>(33.333, 66.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>(0.0, 33.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>(33.333, 66.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>(0.0, 33.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>31.0</td>\n",
|
||
" <td>(0.0, 33.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Age Age\n",
|
||
"1 22.0 (0.0, 33.333]\n",
|
||
"2 38.0 (33.333, 66.667]\n",
|
||
"3 26.0 (0.0, 33.333]\n",
|
||
"4 35.0 (33.333, 66.667]\n",
|
||
"5 35.0 (33.333, 66.667]\n",
|
||
"6 NaN NaN\n",
|
||
"7 54.0 (33.333, 66.667]\n",
|
||
"8 2.0 (0.0, 33.333]\n",
|
||
"9 27.0 (0.0, 33.333]\n",
|
||
"10 14.0 (0.0, 33.333]\n",
|
||
"11 4.0 (0.0, 33.333]\n",
|
||
"12 58.0 (33.333, 66.667]\n",
|
||
"13 20.0 (0.0, 33.333]\n",
|
||
"14 39.0 (33.333, 66.667]\n",
|
||
"15 14.0 (0.0, 33.333]\n",
|
||
"16 55.0 (33.333, 66.667]\n",
|
||
"17 2.0 (0.0, 33.333]\n",
|
||
"18 NaN NaN\n",
|
||
"19 31.0 (0.0, 33.333]\n",
|
||
"20 NaN NaN"
|
||
]
|
||
},
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat([titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins2))], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>Age</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>58.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>39.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>31.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Age Age\n",
|
||
"1 22.0 young\n",
|
||
"2 38.0 middle-aged\n",
|
||
"3 26.0 young\n",
|
||
"4 35.0 middle-aged\n",
|
||
"5 35.0 middle-aged\n",
|
||
"6 NaN NaN\n",
|
||
"7 54.0 middle-aged\n",
|
||
"8 2.0 young\n",
|
||
"9 27.0 young\n",
|
||
"10 14.0 young\n",
|
||
"11 4.0 young\n",
|
||
"12 58.0 middle-aged\n",
|
||
"13 20.0 young\n",
|
||
"14 39.0 middle-aged\n",
|
||
"15 14.0 young\n",
|
||
"16 55.0 middle-aged\n",
|
||
"17 2.0 young\n",
|
||
"18 NaN NaN\n",
|
||
"19 31.0 young\n",
|
||
"20 NaN NaN"
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat([titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins2), labels=labels)], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Разделение данных на 3 группы с установкой собственных границ интервалов (0, 40, 60, 100)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(array([ 0, 40, 60, 100]), array([729, 137, 26]))"
|
||
]
|
||
},
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"hist3, bins3 = np.histogram(\n",
|
||
" titanic[\"Age\"].fillna(titanic[\"Age\"].median()), bins=[0, 40, 60, 100]\n",
|
||
")\n",
|
||
"bins3, hist3"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>Age</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>(40.0, 60.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>58.0</td>\n",
|
||
" <td>(40.0, 60.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>39.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>(40.0, 60.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>31.0</td>\n",
|
||
" <td>(0.0, 40.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Age Age\n",
|
||
"1 22.0 (0.0, 40.0]\n",
|
||
"2 38.0 (0.0, 40.0]\n",
|
||
"3 26.0 (0.0, 40.0]\n",
|
||
"4 35.0 (0.0, 40.0]\n",
|
||
"5 35.0 (0.0, 40.0]\n",
|
||
"6 NaN NaN\n",
|
||
"7 54.0 (40.0, 60.0]\n",
|
||
"8 2.0 (0.0, 40.0]\n",
|
||
"9 27.0 (0.0, 40.0]\n",
|
||
"10 14.0 (0.0, 40.0]\n",
|
||
"11 4.0 (0.0, 40.0]\n",
|
||
"12 58.0 (40.0, 60.0]\n",
|
||
"13 20.0 (0.0, 40.0]\n",
|
||
"14 39.0 (0.0, 40.0]\n",
|
||
"15 14.0 (0.0, 40.0]\n",
|
||
"16 55.0 (40.0, 60.0]\n",
|
||
"17 2.0 (0.0, 40.0]\n",
|
||
"18 NaN NaN\n",
|
||
"19 31.0 (0.0, 40.0]\n",
|
||
"20 NaN NaN"
|
||
]
|
||
},
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat([titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins3))], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>Age</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>58.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>39.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>31.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Age Age\n",
|
||
"1 22.0 young\n",
|
||
"2 38.0 young\n",
|
||
"3 26.0 young\n",
|
||
"4 35.0 young\n",
|
||
"5 35.0 young\n",
|
||
"6 NaN NaN\n",
|
||
"7 54.0 middle-aged\n",
|
||
"8 2.0 young\n",
|
||
"9 27.0 young\n",
|
||
"10 14.0 young\n",
|
||
"11 4.0 young\n",
|
||
"12 58.0 middle-aged\n",
|
||
"13 20.0 young\n",
|
||
"14 39.0 young\n",
|
||
"15 14.0 young\n",
|
||
"16 55.0 middle-aged\n",
|
||
"17 2.0 young\n",
|
||
"18 NaN NaN\n",
|
||
"19 31.0 young\n",
|
||
"20 NaN NaN"
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat([titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins3), labels=labels)], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Квантильное разделение данных на 3 группы"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>Age</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>58.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>39.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>31.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Age Age\n",
|
||
"1 22.0 0.0\n",
|
||
"2 38.0 2.0\n",
|
||
"3 26.0 1.0\n",
|
||
"4 35.0 2.0\n",
|
||
"5 35.0 2.0\n",
|
||
"6 NaN NaN\n",
|
||
"7 54.0 2.0\n",
|
||
"8 2.0 0.0\n",
|
||
"9 27.0 1.0\n",
|
||
"10 14.0 0.0\n",
|
||
"11 4.0 0.0\n",
|
||
"12 58.0 2.0\n",
|
||
"13 20.0 0.0\n",
|
||
"14 39.0 2.0\n",
|
||
"15 14.0 0.0\n",
|
||
"16 55.0 2.0\n",
|
||
"17 2.0 0.0\n",
|
||
"18 NaN NaN\n",
|
||
"19 31.0 1.0\n",
|
||
"20 NaN NaN"
|
||
]
|
||
},
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat([titanic[\"Age\"], pd.qcut(titanic[\"Age\"], q=3, labels=False)], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>Age</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>58.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>39.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>old</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>31.0</td>\n",
|
||
" <td>middle-aged</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Age Age\n",
|
||
"1 22.0 young\n",
|
||
"2 38.0 old\n",
|
||
"3 26.0 middle-aged\n",
|
||
"4 35.0 old\n",
|
||
"5 35.0 old\n",
|
||
"6 NaN NaN\n",
|
||
"7 54.0 old\n",
|
||
"8 2.0 young\n",
|
||
"9 27.0 middle-aged\n",
|
||
"10 14.0 young\n",
|
||
"11 4.0 young\n",
|
||
"12 58.0 old\n",
|
||
"13 20.0 young\n",
|
||
"14 39.0 old\n",
|
||
"15 14.0 young\n",
|
||
"16 55.0 old\n",
|
||
"17 2.0 young\n",
|
||
"18 NaN NaN\n",
|
||
"19 31.0 middle-aged\n",
|
||
"20 NaN NaN"
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat([titanic[\"Age\"], pd.qcut(titanic[\"Age\"], q=3, labels=labels)], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Пример использования библиотеки Featuretools для автоматического конструирования (синтеза) признаков\n",
|
||
"\n",
|
||
"https://featuretools.alteryx.com/en/stable/getting_started/using_entitysets.html"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Загрузка данных\n",
|
||
"\n",
|
||
"За основу был взят набор данных \"Ecommerce Orders Data Set\" из Kaggle\n",
|
||
"\n",
|
||
"Используется только 100 первых заказов и связанные с ними объекты\n",
|
||
"\n",
|
||
"https://www.kaggle.com/datasets/sangamsharmait/ecommerce-orders-data-analysis"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import featuretools as ft\n",
|
||
"from woodwork.logical_types import Categorical, Datetime\n",
|
||
"\n",
|
||
"customers = pd.read_csv(\"data/orders/customers.csv\")\n",
|
||
"sellers = pd.read_csv(\"data/orders/sellers.csv\")\n",
|
||
"products = pd.read_csv(\"data/orders/products.csv\")\n",
|
||
"orders = pd.read_csv(\"data/orders/orders.csv\")\n",
|
||
"orders.fillna({\"order_delivered_carrier_date\": pd.to_datetime(\n",
|
||
" \"1900-01-01 00:00:00\"\n",
|
||
")}, inplace=True)\n",
|
||
"orders.fillna(\n",
|
||
" {\"order_delivered_customer_date\": pd.to_datetime(\"1900-01-01 00:00:00\")},\n",
|
||
" inplace=True,\n",
|
||
")\n",
|
||
"order_items = pd.read_csv(\"data/orders/order_items.csv\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Создание сущностей в featuretools\n",
|
||
"\n",
|
||
"Добавление dataframe'ов с данными в EntitySet с указанием параметров: название сущности (таблицы), первичный ключ, логические типы атрибутов (категориальные и даты)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Entityset: orders\n",
|
||
" DataFrames:\n",
|
||
" customers [Rows: 100, Columns: 5]\n",
|
||
" sellers [Rows: 87, Columns: 4]\n",
|
||
" products [Rows: 100, Columns: 9]\n",
|
||
" orders [Rows: 100, Columns: 8]\n",
|
||
" order_items [Rows: 115, Columns: 8]\n",
|
||
" Relationships:\n",
|
||
" No relationships"
|
||
]
|
||
},
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"es = ft.EntitySet(id=\"orders\")\n",
|
||
"\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"customers\",\n",
|
||
" dataframe=customers,\n",
|
||
" index=\"customer_id\",\n",
|
||
" logical_types={\n",
|
||
" \"customer_unique_id\": Categorical,\n",
|
||
" \"customer_zip_code_prefix\": Categorical,\n",
|
||
" \"customer_city\": Categorical,\n",
|
||
" \"customer_state\": Categorical,\n",
|
||
" },\n",
|
||
")\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"sellers\",\n",
|
||
" dataframe=sellers,\n",
|
||
" index=\"seller_id\",\n",
|
||
" logical_types={\n",
|
||
" \"seller_zip_code_prefix\": Categorical,\n",
|
||
" \"seller_city\": Categorical,\n",
|
||
" \"seller_state\": Categorical,\n",
|
||
" },\n",
|
||
")\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"products\",\n",
|
||
" dataframe=products,\n",
|
||
" index=\"product_id\",\n",
|
||
" logical_types={\n",
|
||
" \"product_category_name\": Categorical,\n",
|
||
" \"product_name_lenght\": Categorical,\n",
|
||
" \"product_description_lenght\": Categorical,\n",
|
||
" \"product_photos_qty\": Categorical,\n",
|
||
" },\n",
|
||
")\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"orders\",\n",
|
||
" dataframe=orders,\n",
|
||
" index=\"order_id\",\n",
|
||
" logical_types={\n",
|
||
" \"order_status\": Categorical,\n",
|
||
" \"order_purchase_timestamp\": Datetime,\n",
|
||
" \"order_approved_at\": Datetime,\n",
|
||
" \"order_delivered_carrier_date\": Datetime,\n",
|
||
" \"order_delivered_customer_date\": Datetime,\n",
|
||
" \"order_estimated_delivery_date\": Datetime,\n",
|
||
" },\n",
|
||
")\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"order_items\",\n",
|
||
" dataframe=order_items,\n",
|
||
" index=\"orderitem_id\",\n",
|
||
" make_index=True,\n",
|
||
" logical_types={\"shipping_limit_date\": Datetime},\n",
|
||
")\n",
|
||
"\n",
|
||
"es"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Настройка связей между сущностями featuretools\n",
|
||
"\n",
|
||
"Настройка связей между таблицами на уровне ключей\n",
|
||
"\n",
|
||
"Связь указывается от родителя к потомкам (таблица-родитель, первичный ключ, таблица-потомок, внешний ключ)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Entityset: orders\n",
|
||
" DataFrames:\n",
|
||
" customers [Rows: 100, Columns: 5]\n",
|
||
" sellers [Rows: 87, Columns: 4]\n",
|
||
" products [Rows: 100, Columns: 9]\n",
|
||
" orders [Rows: 100, Columns: 8]\n",
|
||
" order_items [Rows: 115, Columns: 8]\n",
|
||
" Relationships:\n",
|
||
" orders.customer_id -> customers.customer_id\n",
|
||
" order_items.order_id -> orders.order_id\n",
|
||
" order_items.product_id -> products.product_id\n",
|
||
" order_items.seller_id -> sellers.seller_id"
|
||
]
|
||
},
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"es = es.add_relationship(\"customers\", \"customer_id\", \"orders\", \"customer_id\")\n",
|
||
"es = es.add_relationship(\"orders\", \"order_id\", \"order_items\", \"order_id\")\n",
|
||
"es = es.add_relationship(\"products\", \"product_id\", \"order_items\", \"product_id\")\n",
|
||
"es = es.add_relationship(\"sellers\", \"seller_id\", \"order_items\", \"seller_id\")\n",
|
||
"\n",
|
||
"es"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Автоматическое конструирование признаков с помощью featuretools\n",
|
||
"\n",
|
||
"Библиотека применяет различные функции агрегации и трансформации к атрибутам таблицы order_items с учетом отношений\n",
|
||
"\n",
|
||
"Результат помещается в Dataframe feature_matrix"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\featuretools\\synthesis\\dfs.py:321: UnusedPrimitiveWarning: Some specified primitives were not used during DFS:\n",
|
||
" agg_primitives: ['any', 'mode']\n",
|
||
"This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.\n",
|
||
" warnings.warn(warning_msg, UnusedPrimitiveWarning)\n",
|
||
"c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function mean at 0x00000221E9563600> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
|
||
" ).agg(to_agg)\n",
|
||
"c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function mean at 0x00000221E9563600> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
|
||
" ).agg(to_agg)\n",
|
||
"c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function mean at 0x00000221E9563600> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
|
||
" ).agg(to_agg)\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>order_item_id</th>\n",
|
||
" <th>price</th>\n",
|
||
" <th>freight_value</th>\n",
|
||
" <th>HOUR(shipping_limit_date)</th>\n",
|
||
" <th>WEEKDAY(shipping_limit_date)</th>\n",
|
||
" <th>orders.order_status</th>\n",
|
||
" <th>products.product_category_name</th>\n",
|
||
" <th>products.product_name_lenght</th>\n",
|
||
" <th>products.product_description_lenght</th>\n",
|
||
" <th>products.product_photos_qty</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>orders.customers.customer_city</th>\n",
|
||
" <th>orders.customers.customer_state</th>\n",
|
||
" <th>products.COUNT(order_items)</th>\n",
|
||
" <th>products.MEAN(order_items.freight_value)</th>\n",
|
||
" <th>products.MEAN(order_items.order_item_id)</th>\n",
|
||
" <th>products.MEAN(order_items.price)</th>\n",
|
||
" <th>sellers.COUNT(order_items)</th>\n",
|
||
" <th>sellers.MEAN(order_items.freight_value)</th>\n",
|
||
" <th>sellers.MEAN(order_items.order_item_id)</th>\n",
|
||
" <th>sellers.MEAN(order_items.price)</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>orderitem_id</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>38.50</td>\n",
|
||
" <td>24.84</td>\n",
|
||
" <td>20</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>delivered</td>\n",
|
||
" <td>cama_mesa_banho</td>\n",
|
||
" <td>53.0</td>\n",
|
||
" <td>223.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>santa luzia</td>\n",
|
||
" <td>PB</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>24.84</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>38.50</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>21.340</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>61.200000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>29.99</td>\n",
|
||
" <td>7.39</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>delivered</td>\n",
|
||
" <td>telefonia</td>\n",
|
||
" <td>59.0</td>\n",
|
||
" <td>675.0</td>\n",
|
||
" <td>5.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>sao paulo</td>\n",
|
||
" <td>SP</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>7.39</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>29.99</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>7.390</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>29.990000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>110.99</td>\n",
|
||
" <td>21.27</td>\n",
|
||
" <td>21</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>delivered</td>\n",
|
||
" <td>cama_mesa_banho</td>\n",
|
||
" <td>52.0</td>\n",
|
||
" <td>413.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>gravatai</td>\n",
|
||
" <td>RS</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>21.27</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>110.99</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>21.270</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>110.990000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>27.99</td>\n",
|
||
" <td>15.10</td>\n",
|
||
" <td>23</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>delivered</td>\n",
|
||
" <td>telefonia</td>\n",
|
||
" <td>60.0</td>\n",
|
||
" <td>818.0</td>\n",
|
||
" <td>6.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>imbituba</td>\n",
|
||
" <td>SC</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>15.10</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>27.99</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>13.970</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>26.490000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>49.90</td>\n",
|
||
" <td>16.05</td>\n",
|
||
" <td>13</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>invoiced</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>santa rosa</td>\n",
|
||
" <td>RS</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>16.05</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>49.90</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>16.050</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>49.900000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>110</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>17.90</td>\n",
|
||
" <td>10.96</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>delivered</td>\n",
|
||
" <td>cama_mesa_banho</td>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>122.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>jundiai</td>\n",
|
||
" <td>SP</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>10.96</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>17.90</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>10.960</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>17.900000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>111</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>79.99</td>\n",
|
||
" <td>8.91</td>\n",
|
||
" <td>9</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>delivered</td>\n",
|
||
" <td>beleza_saude</td>\n",
|
||
" <td>59.0</td>\n",
|
||
" <td>492.0</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>sao paulo</td>\n",
|
||
" <td>SP</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>8.91</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>79.99</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>13.206</td>\n",
|
||
" <td>1.2</td>\n",
|
||
" <td>54.590000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>112</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>190.00</td>\n",
|
||
" <td>19.41</td>\n",
|
||
" <td>13</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>delivered</td>\n",
|
||
" <td>climatizacao</td>\n",
|
||
" <td>60.0</td>\n",
|
||
" <td>3270.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>paulinia</td>\n",
|
||
" <td>SP</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>19.41</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>190.00</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>19.410</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>190.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>113</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>109.90</td>\n",
|
||
" <td>15.53</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>delivered</td>\n",
|
||
" <td>cool_stuff</td>\n",
|
||
" <td>46.0</td>\n",
|
||
" <td>595.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>rio de janeiro</td>\n",
|
||
" <td>RJ</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>15.53</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>109.90</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>15.530</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>109.900000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>114</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>27.90</td>\n",
|
||
" <td>18.30</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>delivered</td>\n",
|
||
" <td>alimentos</td>\n",
|
||
" <td>59.0</td>\n",
|
||
" <td>982.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>joinville</td>\n",
|
||
" <td>SC</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>16.70</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>27.90</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>16.190</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>38.596667</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>115 rows × 43 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" order_item_id price freight_value HOUR(shipping_limit_date) \\\n",
|
||
"orderitem_id \n",
|
||
"0 1 38.50 24.84 20 \n",
|
||
"1 1 29.99 7.39 8 \n",
|
||
"2 1 110.99 21.27 21 \n",
|
||
"3 1 27.99 15.10 23 \n",
|
||
"4 1 49.90 16.05 13 \n",
|
||
"... ... ... ... ... \n",
|
||
"110 1 17.90 10.96 8 \n",
|
||
"111 1 79.99 8.91 9 \n",
|
||
"112 1 190.00 19.41 13 \n",
|
||
"113 1 109.90 15.53 2 \n",
|
||
"114 1 27.90 18.30 14 \n",
|
||
"\n",
|
||
" WEEKDAY(shipping_limit_date) orders.order_status \\\n",
|
||
"orderitem_id \n",
|
||
"0 4 delivered \n",
|
||
"1 0 delivered \n",
|
||
"2 1 delivered \n",
|
||
"3 1 delivered \n",
|
||
"4 2 invoiced \n",
|
||
"... ... ... \n",
|
||
"110 1 delivered \n",
|
||
"111 4 delivered \n",
|
||
"112 3 delivered \n",
|
||
"113 2 delivered \n",
|
||
"114 2 delivered \n",
|
||
"\n",
|
||
" products.product_category_name products.product_name_lenght \\\n",
|
||
"orderitem_id \n",
|
||
"0 cama_mesa_banho 53.0 \n",
|
||
"1 telefonia 59.0 \n",
|
||
"2 cama_mesa_banho 52.0 \n",
|
||
"3 telefonia 60.0 \n",
|
||
"4 NaN NaN \n",
|
||
"... ... ... \n",
|
||
"110 cama_mesa_banho 55.0 \n",
|
||
"111 beleza_saude 59.0 \n",
|
||
"112 climatizacao 60.0 \n",
|
||
"113 cool_stuff 46.0 \n",
|
||
"114 alimentos 59.0 \n",
|
||
"\n",
|
||
" products.product_description_lenght products.product_photos_qty \\\n",
|
||
"orderitem_id \n",
|
||
"0 223.0 1.0 \n",
|
||
"1 675.0 5.0 \n",
|
||
"2 413.0 1.0 \n",
|
||
"3 818.0 6.0 \n",
|
||
"4 NaN NaN \n",
|
||
"... ... ... \n",
|
||
"110 122.0 1.0 \n",
|
||
"111 492.0 3.0 \n",
|
||
"112 3270.0 4.0 \n",
|
||
"113 595.0 2.0 \n",
|
||
"114 982.0 1.0 \n",
|
||
"\n",
|
||
" ... orders.customers.customer_city \\\n",
|
||
"orderitem_id ... \n",
|
||
"0 ... santa luzia \n",
|
||
"1 ... sao paulo \n",
|
||
"2 ... gravatai \n",
|
||
"3 ... imbituba \n",
|
||
"4 ... santa rosa \n",
|
||
"... ... ... \n",
|
||
"110 ... jundiai \n",
|
||
"111 ... sao paulo \n",
|
||
"112 ... paulinia \n",
|
||
"113 ... rio de janeiro \n",
|
||
"114 ... joinville \n",
|
||
"\n",
|
||
" orders.customers.customer_state products.COUNT(order_items) \\\n",
|
||
"orderitem_id \n",
|
||
"0 PB 1 \n",
|
||
"1 SP 1 \n",
|
||
"2 RS 1 \n",
|
||
"3 SC 1 \n",
|
||
"4 RS 1 \n",
|
||
"... ... ... \n",
|
||
"110 SP 1 \n",
|
||
"111 SP 1 \n",
|
||
"112 SP 1 \n",
|
||
"113 RJ 1 \n",
|
||
"114 SC 2 \n",
|
||
"\n",
|
||
" products.MEAN(order_items.freight_value) \\\n",
|
||
"orderitem_id \n",
|
||
"0 24.84 \n",
|
||
"1 7.39 \n",
|
||
"2 21.27 \n",
|
||
"3 15.10 \n",
|
||
"4 16.05 \n",
|
||
"... ... \n",
|
||
"110 10.96 \n",
|
||
"111 8.91 \n",
|
||
"112 19.41 \n",
|
||
"113 15.53 \n",
|
||
"114 16.70 \n",
|
||
"\n",
|
||
" products.MEAN(order_items.order_item_id) \\\n",
|
||
"orderitem_id \n",
|
||
"0 1.0 \n",
|
||
"1 1.0 \n",
|
||
"2 1.0 \n",
|
||
"3 1.0 \n",
|
||
"4 1.0 \n",
|
||
"... ... \n",
|
||
"110 1.0 \n",
|
||
"111 1.0 \n",
|
||
"112 1.0 \n",
|
||
"113 1.0 \n",
|
||
"114 1.0 \n",
|
||
"\n",
|
||
" products.MEAN(order_items.price) sellers.COUNT(order_items) \\\n",
|
||
"orderitem_id \n",
|
||
"0 38.50 2 \n",
|
||
"1 29.99 1 \n",
|
||
"2 110.99 1 \n",
|
||
"3 27.99 2 \n",
|
||
"4 49.90 1 \n",
|
||
"... ... ... \n",
|
||
"110 17.90 1 \n",
|
||
"111 79.99 5 \n",
|
||
"112 190.00 1 \n",
|
||
"113 109.90 1 \n",
|
||
"114 27.90 3 \n",
|
||
"\n",
|
||
" sellers.MEAN(order_items.freight_value) \\\n",
|
||
"orderitem_id \n",
|
||
"0 21.340 \n",
|
||
"1 7.390 \n",
|
||
"2 21.270 \n",
|
||
"3 13.970 \n",
|
||
"4 16.050 \n",
|
||
"... ... \n",
|
||
"110 10.960 \n",
|
||
"111 13.206 \n",
|
||
"112 19.410 \n",
|
||
"113 15.530 \n",
|
||
"114 16.190 \n",
|
||
"\n",
|
||
" sellers.MEAN(order_items.order_item_id) \\\n",
|
||
"orderitem_id \n",
|
||
"0 1.0 \n",
|
||
"1 1.0 \n",
|
||
"2 1.0 \n",
|
||
"3 1.0 \n",
|
||
"4 1.0 \n",
|
||
"... ... \n",
|
||
"110 1.0 \n",
|
||
"111 1.2 \n",
|
||
"112 1.0 \n",
|
||
"113 1.0 \n",
|
||
"114 1.0 \n",
|
||
"\n",
|
||
" sellers.MEAN(order_items.price) \n",
|
||
"orderitem_id \n",
|
||
"0 61.200000 \n",
|
||
"1 29.990000 \n",
|
||
"2 110.990000 \n",
|
||
"3 26.490000 \n",
|
||
"4 49.900000 \n",
|
||
"... ... \n",
|
||
"110 17.900000 \n",
|
||
"111 54.590000 \n",
|
||
"112 190.000000 \n",
|
||
"113 109.900000 \n",
|
||
"114 38.596667 \n",
|
||
"\n",
|
||
"[115 rows x 43 columns]"
|
||
]
|
||
},
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"feature_matrix, feature_defs = ft.dfs(\n",
|
||
" entityset=es,\n",
|
||
" target_dataframe_name=\"order_items\",\n",
|
||
" agg_primitives=[\"mean\", \"count\", \"mode\", \"any\"],\n",
|
||
" trans_primitives=[\"hour\", \"weekday\"],\n",
|
||
" max_depth=2,\n",
|
||
")\n",
|
||
"\n",
|
||
"feature_matrix"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Полученные признаки\n",
|
||
"\n",
|
||
"Список колонок полученного dataframe'а"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"[<Feature: order_item_id>,\n",
|
||
" <Feature: price>,\n",
|
||
" <Feature: freight_value>,\n",
|
||
" <Feature: HOUR(shipping_limit_date)>,\n",
|
||
" <Feature: WEEKDAY(shipping_limit_date)>,\n",
|
||
" <Feature: orders.order_status>,\n",
|
||
" <Feature: products.product_category_name>,\n",
|
||
" <Feature: products.product_name_lenght>,\n",
|
||
" <Feature: products.product_description_lenght>,\n",
|
||
" <Feature: products.product_photos_qty>,\n",
|
||
" <Feature: products.product_weight_g>,\n",
|
||
" <Feature: products.product_length_cm>,\n",
|
||
" <Feature: products.product_height_cm>,\n",
|
||
" <Feature: products.product_width_cm>,\n",
|
||
" <Feature: sellers.seller_zip_code_prefix>,\n",
|
||
" <Feature: sellers.seller_city>,\n",
|
||
" <Feature: sellers.seller_state>,\n",
|
||
" <Feature: orders.COUNT(order_items)>,\n",
|
||
" <Feature: orders.MEAN(order_items.freight_value)>,\n",
|
||
" <Feature: orders.MEAN(order_items.order_item_id)>,\n",
|
||
" <Feature: orders.MEAN(order_items.price)>,\n",
|
||
" <Feature: orders.HOUR(order_approved_at)>,\n",
|
||
" <Feature: orders.HOUR(order_delivered_carrier_date)>,\n",
|
||
" <Feature: orders.HOUR(order_delivered_customer_date)>,\n",
|
||
" <Feature: orders.HOUR(order_estimated_delivery_date)>,\n",
|
||
" <Feature: orders.HOUR(order_purchase_timestamp)>,\n",
|
||
" <Feature: orders.WEEKDAY(order_approved_at)>,\n",
|
||
" <Feature: orders.WEEKDAY(order_delivered_carrier_date)>,\n",
|
||
" <Feature: orders.WEEKDAY(order_delivered_customer_date)>,\n",
|
||
" <Feature: orders.WEEKDAY(order_estimated_delivery_date)>,\n",
|
||
" <Feature: orders.WEEKDAY(order_purchase_timestamp)>,\n",
|
||
" <Feature: orders.customers.customer_unique_id>,\n",
|
||
" <Feature: orders.customers.customer_zip_code_prefix>,\n",
|
||
" <Feature: orders.customers.customer_city>,\n",
|
||
" <Feature: orders.customers.customer_state>,\n",
|
||
" <Feature: products.COUNT(order_items)>,\n",
|
||
" <Feature: products.MEAN(order_items.freight_value)>,\n",
|
||
" <Feature: products.MEAN(order_items.order_item_id)>,\n",
|
||
" <Feature: products.MEAN(order_items.price)>,\n",
|
||
" <Feature: sellers.COUNT(order_items)>,\n",
|
||
" <Feature: sellers.MEAN(order_items.freight_value)>,\n",
|
||
" <Feature: sellers.MEAN(order_items.order_item_id)>,\n",
|
||
" <Feature: sellers.MEAN(order_items.price)>]"
|
||
]
|
||
},
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"feature_defs"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Отсечение значений признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Определение выбросов с помощью boxplot"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 148,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<Axes: >"
|
||
]
|
||
},
|
||
"execution_count": 148,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"titanic.boxplot(column=\"Age\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Отсечение данных для признака Возраст, значение которых больше 65 лет"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 149,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>AgeClip</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>34</th>\n",
|
||
" <td>Wheadon, Mr. Edward H</td>\n",
|
||
" <td>66.0</td>\n",
|
||
" <td>65.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>97</th>\n",
|
||
" <td>Goldschmidt, Mr. George B</td>\n",
|
||
" <td>71.0</td>\n",
|
||
" <td>65.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>117</th>\n",
|
||
" <td>Connors, Mr. Patrick</td>\n",
|
||
" <td>70.5</td>\n",
|
||
" <td>65.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>494</th>\n",
|
||
" <td>Artagaveytia, Mr. Ramon</td>\n",
|
||
" <td>71.0</td>\n",
|
||
" <td>65.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>631</th>\n",
|
||
" <td>Barkworth, Mr. Algernon Henry Wilson</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>65.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>673</th>\n",
|
||
" <td>Mitchell, Mr. Henry Michael</td>\n",
|
||
" <td>70.0</td>\n",
|
||
" <td>65.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>746</th>\n",
|
||
" <td>Crosby, Capt. Edward Gifford</td>\n",
|
||
" <td>70.0</td>\n",
|
||
" <td>65.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>852</th>\n",
|
||
" <td>Svensson, Mr. Johan</td>\n",
|
||
" <td>74.0</td>\n",
|
||
" <td>65.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Name Age AgeClip\n",
|
||
"34 Wheadon, Mr. Edward H 66.0 65.0\n",
|
||
"97 Goldschmidt, Mr. George B 71.0 65.0\n",
|
||
"117 Connors, Mr. Patrick 70.5 65.0\n",
|
||
"494 Artagaveytia, Mr. Ramon 71.0 65.0\n",
|
||
"631 Barkworth, Mr. Algernon Henry Wilson 80.0 65.0\n",
|
||
"673 Mitchell, Mr. Henry Michael 70.0 65.0\n",
|
||
"746 Crosby, Capt. Edward Gifford 70.0 65.0\n",
|
||
"852 Svensson, Mr. Johan 74.0 65.0"
|
||
]
|
||
},
|
||
"execution_count": 149,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"titanic_norm = titanic.copy()\n",
|
||
"\n",
|
||
"titanic_norm[\"AgeClip\"] = titanic[\"Age\"].clip(0, 65);\n",
|
||
"\n",
|
||
"titanic_norm[titanic_norm[\"Age\"] > 65][[\"Name\", \"Age\", \"AgeClip\"]]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Винсоризация признака Возраст"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 150,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"56.0\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>AgeWinsorize</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>34</th>\n",
|
||
" <td>Wheadon, Mr. Edward H</td>\n",
|
||
" <td>66.0</td>\n",
|
||
" <td>54.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>97</th>\n",
|
||
" <td>Goldschmidt, Mr. George B</td>\n",
|
||
" <td>71.0</td>\n",
|
||
" <td>54.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>117</th>\n",
|
||
" <td>Connors, Mr. Patrick</td>\n",
|
||
" <td>70.5</td>\n",
|
||
" <td>54.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>494</th>\n",
|
||
" <td>Artagaveytia, Mr. Ramon</td>\n",
|
||
" <td>71.0</td>\n",
|
||
" <td>54.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>631</th>\n",
|
||
" <td>Barkworth, Mr. Algernon Henry Wilson</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>54.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>673</th>\n",
|
||
" <td>Mitchell, Mr. Henry Michael</td>\n",
|
||
" <td>70.0</td>\n",
|
||
" <td>54.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>746</th>\n",
|
||
" <td>Crosby, Capt. Edward Gifford</td>\n",
|
||
" <td>70.0</td>\n",
|
||
" <td>54.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>852</th>\n",
|
||
" <td>Svensson, Mr. Johan</td>\n",
|
||
" <td>74.0</td>\n",
|
||
" <td>54.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Name Age AgeWinsorize\n",
|
||
"34 Wheadon, Mr. Edward H 66.0 54.0\n",
|
||
"97 Goldschmidt, Mr. George B 71.0 54.0\n",
|
||
"117 Connors, Mr. Patrick 70.5 54.0\n",
|
||
"494 Artagaveytia, Mr. Ramon 71.0 54.0\n",
|
||
"631 Barkworth, Mr. Algernon Henry Wilson 80.0 54.0\n",
|
||
"673 Mitchell, Mr. Henry Michael 70.0 54.0\n",
|
||
"746 Crosby, Capt. Edward Gifford 70.0 54.0\n",
|
||
"852 Svensson, Mr. Johan 74.0 54.0"
|
||
]
|
||
},
|
||
"execution_count": 150,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from scipy.stats.mstats import winsorize\n",
|
||
"\n",
|
||
"print(titanic_norm[\"Age\"].quantile(q=0.95))\n",
|
||
"\n",
|
||
"titanic_norm[\"AgeWinsorize\"] = winsorize(\n",
|
||
" titanic_norm[\"Age\"].fillna(titanic_norm[\"Age\"].mean()), (0, 0.05), inplace=False\n",
|
||
")\n",
|
||
"\n",
|
||
"titanic_norm[titanic_norm[\"Age\"] > 65][[\"Name\", \"Age\", \"AgeWinsorize\"]]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Нормализация значений"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 153,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>AgeNorm</th>\n",
|
||
" <th>AgeClipNorm</th>\n",
|
||
" <th>AgeWinsorizeNorm</th>\n",
|
||
" <th>AgeWinsorizeNorm2</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>Braund, Mr. Owen Harris</td>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>0.271174</td>\n",
|
||
" <td>0.334159</td>\n",
|
||
" <td>0.402762</td>\n",
|
||
" <td>-0.194476</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>0.472229</td>\n",
|
||
" <td>0.581914</td>\n",
|
||
" <td>0.701381</td>\n",
|
||
" <td>0.402762</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>Heikkinen, Miss. Laina</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>0.321438</td>\n",
|
||
" <td>0.396098</td>\n",
|
||
" <td>0.477417</td>\n",
|
||
" <td>-0.045166</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>0.434531</td>\n",
|
||
" <td>0.535460</td>\n",
|
||
" <td>0.645390</td>\n",
|
||
" <td>0.290780</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Allen, Mr. William Henry</td>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>0.434531</td>\n",
|
||
" <td>0.535460</td>\n",
|
||
" <td>0.645390</td>\n",
|
||
" <td>0.290780</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>Moran, Mr. James</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.546456</td>\n",
|
||
" <td>0.092912</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>McCarthy, Mr. Timothy J</td>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>0.673285</td>\n",
|
||
" <td>0.829669</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>Palsson, Master. Gosta Leonard</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0.019854</td>\n",
|
||
" <td>0.024466</td>\n",
|
||
" <td>0.029489</td>\n",
|
||
" <td>-0.941023</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)</td>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>0.334004</td>\n",
|
||
" <td>0.411583</td>\n",
|
||
" <td>0.496081</td>\n",
|
||
" <td>-0.007839</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>Nasser, Mrs. Nicholas (Adele Achem)</td>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>0.170646</td>\n",
|
||
" <td>0.210282</td>\n",
|
||
" <td>0.253453</td>\n",
|
||
" <td>-0.493094</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>Sandstrom, Miss. Marguerite Rut</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>0.044986</td>\n",
|
||
" <td>0.055435</td>\n",
|
||
" <td>0.066816</td>\n",
|
||
" <td>-0.866368</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>Bonnell, Miss. Elizabeth</td>\n",
|
||
" <td>58.0</td>\n",
|
||
" <td>0.723549</td>\n",
|
||
" <td>0.891607</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>Saundercock, Mr. William Henry</td>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>0.246042</td>\n",
|
||
" <td>0.303190</td>\n",
|
||
" <td>0.365435</td>\n",
|
||
" <td>-0.269130</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>Andersson, Mr. Anders Johan</td>\n",
|
||
" <td>39.0</td>\n",
|
||
" <td>0.484795</td>\n",
|
||
" <td>0.597399</td>\n",
|
||
" <td>0.720045</td>\n",
|
||
" <td>0.440090</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>Vestrom, Miss. Hulda Amanda Adolfina</td>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>0.170646</td>\n",
|
||
" <td>0.210282</td>\n",
|
||
" <td>0.253453</td>\n",
|
||
" <td>-0.493094</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>Hewlett, Mrs. (Mary D Kingcome)</td>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>0.685851</td>\n",
|
||
" <td>0.845153</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>Rice, Master. Eugene</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0.019854</td>\n",
|
||
" <td>0.024466</td>\n",
|
||
" <td>0.029489</td>\n",
|
||
" <td>-0.941023</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>Williams, Mr. Charles Eugene</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.546456</td>\n",
|
||
" <td>0.092912</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>Vander Planke, Mrs. Julius (Emelia Maria Vande...</td>\n",
|
||
" <td>31.0</td>\n",
|
||
" <td>0.384267</td>\n",
|
||
" <td>0.473521</td>\n",
|
||
" <td>0.570735</td>\n",
|
||
" <td>0.141471</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>Masselmani, Mrs. Fatima</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.546456</td>\n",
|
||
" <td>0.092912</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Name Age AgeNorm \\\n",
|
||
"1 Braund, Mr. Owen Harris 22.0 0.271174 \n",
|
||
"2 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 0.472229 \n",
|
||
"3 Heikkinen, Miss. Laina 26.0 0.321438 \n",
|
||
"4 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 0.434531 \n",
|
||
"5 Allen, Mr. William Henry 35.0 0.434531 \n",
|
||
"6 Moran, Mr. James NaN NaN \n",
|
||
"7 McCarthy, Mr. Timothy J 54.0 0.673285 \n",
|
||
"8 Palsson, Master. Gosta Leonard 2.0 0.019854 \n",
|
||
"9 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 27.0 0.334004 \n",
|
||
"10 Nasser, Mrs. Nicholas (Adele Achem) 14.0 0.170646 \n",
|
||
"11 Sandstrom, Miss. Marguerite Rut 4.0 0.044986 \n",
|
||
"12 Bonnell, Miss. Elizabeth 58.0 0.723549 \n",
|
||
"13 Saundercock, Mr. William Henry 20.0 0.246042 \n",
|
||
"14 Andersson, Mr. Anders Johan 39.0 0.484795 \n",
|
||
"15 Vestrom, Miss. Hulda Amanda Adolfina 14.0 0.170646 \n",
|
||
"16 Hewlett, Mrs. (Mary D Kingcome) 55.0 0.685851 \n",
|
||
"17 Rice, Master. Eugene 2.0 0.019854 \n",
|
||
"18 Williams, Mr. Charles Eugene NaN NaN \n",
|
||
"19 Vander Planke, Mrs. Julius (Emelia Maria Vande... 31.0 0.384267 \n",
|
||
"20 Masselmani, Mrs. Fatima NaN NaN \n",
|
||
"\n",
|
||
" AgeClipNorm AgeWinsorizeNorm AgeWinsorizeNorm2 \n",
|
||
"1 0.334159 0.402762 -0.194476 \n",
|
||
"2 0.581914 0.701381 0.402762 \n",
|
||
"3 0.396098 0.477417 -0.045166 \n",
|
||
"4 0.535460 0.645390 0.290780 \n",
|
||
"5 0.535460 0.645390 0.290780 \n",
|
||
"6 NaN 0.546456 0.092912 \n",
|
||
"7 0.829669 1.000000 1.000000 \n",
|
||
"8 0.024466 0.029489 -0.941023 \n",
|
||
"9 0.411583 0.496081 -0.007839 \n",
|
||
"10 0.210282 0.253453 -0.493094 \n",
|
||
"11 0.055435 0.066816 -0.866368 \n",
|
||
"12 0.891607 1.000000 1.000000 \n",
|
||
"13 0.303190 0.365435 -0.269130 \n",
|
||
"14 0.597399 0.720045 0.440090 \n",
|
||
"15 0.210282 0.253453 -0.493094 \n",
|
||
"16 0.845153 1.000000 1.000000 \n",
|
||
"17 0.024466 0.029489 -0.941023 \n",
|
||
"18 NaN 0.546456 0.092912 \n",
|
||
"19 0.473521 0.570735 0.141471 \n",
|
||
"20 NaN 0.546456 0.092912 "
|
||
]
|
||
},
|
||
"execution_count": 153,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn import preprocessing\n",
|
||
"\n",
|
||
"min_max_scaler = preprocessing.MinMaxScaler()\n",
|
||
"\n",
|
||
"min_max_scaler_2 = preprocessing.MinMaxScaler(feature_range=(-1, 1))\n",
|
||
"\n",
|
||
"titanic_norm[\"AgeNorm\"] = min_max_scaler.fit_transform(\n",
|
||
" titanic_norm[\"Age\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(titanic_norm[\"Age\"].shape)\n",
|
||
"\n",
|
||
"titanic_norm[\"AgeClipNorm\"] = min_max_scaler.fit_transform(\n",
|
||
" titanic_norm[\"AgeClip\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(titanic_norm[\"Age\"].shape)\n",
|
||
"\n",
|
||
"titanic_norm[\"AgeWinsorizeNorm\"] = min_max_scaler.fit_transform(\n",
|
||
" titanic_norm[\"AgeWinsorize\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(titanic_norm[\"Age\"].shape)\n",
|
||
"\n",
|
||
"titanic_norm[\"AgeWinsorizeNorm2\"] = min_max_scaler_2.fit_transform(\n",
|
||
" titanic_norm[\"AgeWinsorize\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(titanic_norm[\"Age\"].shape)\n",
|
||
"\n",
|
||
"titanic_norm[\n",
|
||
" [\"Name\", \"Age\", \"AgeNorm\", \"AgeClipNorm\", \"AgeWinsorizeNorm\", \"AgeWinsorizeNorm2\"]\n",
|
||
"].head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Стандартизация значений"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 152,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>AgeStand</th>\n",
|
||
" <th>AgeClipStand</th>\n",
|
||
" <th>AgeWinsorizeStand</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>Braund, Mr. Owen Harris</td>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>-0.530377</td>\n",
|
||
" <td>-0.532745</td>\n",
|
||
" <td>-0.606602</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>0.571831</td>\n",
|
||
" <td>0.585060</td>\n",
|
||
" <td>0.718863</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>Heikkinen, Miss. Laina</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>-0.254825</td>\n",
|
||
" <td>-0.253294</td>\n",
|
||
" <td>-0.275236</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>0.365167</td>\n",
|
||
" <td>0.375472</td>\n",
|
||
" <td>0.470339</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Allen, Mr. William Henry</td>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>0.365167</td>\n",
|
||
" <td>0.375472</td>\n",
|
||
" <td>0.470339</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>Moran, Mr. James</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.031205</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>McCarthy, Mr. Timothy J</td>\n",
|
||
" <td>54.0</td>\n",
|
||
" <td>1.674039</td>\n",
|
||
" <td>1.702866</td>\n",
|
||
" <td>2.044329</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>Palsson, Master. Gosta Leonard</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>-1.908136</td>\n",
|
||
" <td>-1.930003</td>\n",
|
||
" <td>-2.263435</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)</td>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>-0.185937</td>\n",
|
||
" <td>-0.183431</td>\n",
|
||
" <td>-0.192394</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>Nasser, Mrs. Nicholas (Adele Achem)</td>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>-1.081480</td>\n",
|
||
" <td>-1.091648</td>\n",
|
||
" <td>-1.269335</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>Sandstrom, Miss. Marguerite Rut</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>-1.770360</td>\n",
|
||
" <td>-1.790277</td>\n",
|
||
" <td>-2.097751</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>Bonnell, Miss. Elizabeth</td>\n",
|
||
" <td>58.0</td>\n",
|
||
" <td>1.949591</td>\n",
|
||
" <td>1.982317</td>\n",
|
||
" <td>2.044329</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>Saundercock, Mr. William Henry</td>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>-0.668153</td>\n",
|
||
" <td>-0.672471</td>\n",
|
||
" <td>-0.772286</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>Andersson, Mr. Anders Johan</td>\n",
|
||
" <td>39.0</td>\n",
|
||
" <td>0.640719</td>\n",
|
||
" <td>0.654923</td>\n",
|
||
" <td>0.801705</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>Vestrom, Miss. Hulda Amanda Adolfina</td>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>-1.081480</td>\n",
|
||
" <td>-1.091648</td>\n",
|
||
" <td>-1.269335</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>Hewlett, Mrs. (Mary D Kingcome)</td>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>1.742927</td>\n",
|
||
" <td>1.772729</td>\n",
|
||
" <td>2.044329</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>Rice, Master. Eugene</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>-1.908136</td>\n",
|
||
" <td>-1.930003</td>\n",
|
||
" <td>-2.263435</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>Williams, Mr. Charles Eugene</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.031205</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>Vander Planke, Mrs. Julius (Emelia Maria Vande...</td>\n",
|
||
" <td>31.0</td>\n",
|
||
" <td>0.089615</td>\n",
|
||
" <td>0.096020</td>\n",
|
||
" <td>0.138972</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>Masselmani, Mrs. Fatima</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.031205</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Name Age AgeStand \\\n",
|
||
"1 Braund, Mr. Owen Harris 22.0 -0.530377 \n",
|
||
"2 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 0.571831 \n",
|
||
"3 Heikkinen, Miss. Laina 26.0 -0.254825 \n",
|
||
"4 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 0.365167 \n",
|
||
"5 Allen, Mr. William Henry 35.0 0.365167 \n",
|
||
"6 Moran, Mr. James NaN NaN \n",
|
||
"7 McCarthy, Mr. Timothy J 54.0 1.674039 \n",
|
||
"8 Palsson, Master. Gosta Leonard 2.0 -1.908136 \n",
|
||
"9 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 27.0 -0.185937 \n",
|
||
"10 Nasser, Mrs. Nicholas (Adele Achem) 14.0 -1.081480 \n",
|
||
"11 Sandstrom, Miss. Marguerite Rut 4.0 -1.770360 \n",
|
||
"12 Bonnell, Miss. Elizabeth 58.0 1.949591 \n",
|
||
"13 Saundercock, Mr. William Henry 20.0 -0.668153 \n",
|
||
"14 Andersson, Mr. Anders Johan 39.0 0.640719 \n",
|
||
"15 Vestrom, Miss. Hulda Amanda Adolfina 14.0 -1.081480 \n",
|
||
"16 Hewlett, Mrs. (Mary D Kingcome) 55.0 1.742927 \n",
|
||
"17 Rice, Master. Eugene 2.0 -1.908136 \n",
|
||
"18 Williams, Mr. Charles Eugene NaN NaN \n",
|
||
"19 Vander Planke, Mrs. Julius (Emelia Maria Vande... 31.0 0.089615 \n",
|
||
"20 Masselmani, Mrs. Fatima NaN NaN \n",
|
||
"\n",
|
||
" AgeClipStand AgeWinsorizeStand \n",
|
||
"1 -0.532745 -0.606602 \n",
|
||
"2 0.585060 0.718863 \n",
|
||
"3 -0.253294 -0.275236 \n",
|
||
"4 0.375472 0.470339 \n",
|
||
"5 0.375472 0.470339 \n",
|
||
"6 NaN 0.031205 \n",
|
||
"7 1.702866 2.044329 \n",
|
||
"8 -1.930003 -2.263435 \n",
|
||
"9 -0.183431 -0.192394 \n",
|
||
"10 -1.091648 -1.269335 \n",
|
||
"11 -1.790277 -2.097751 \n",
|
||
"12 1.982317 2.044329 \n",
|
||
"13 -0.672471 -0.772286 \n",
|
||
"14 0.654923 0.801705 \n",
|
||
"15 -1.091648 -1.269335 \n",
|
||
"16 1.772729 2.044329 \n",
|
||
"17 -1.930003 -2.263435 \n",
|
||
"18 NaN 0.031205 \n",
|
||
"19 0.096020 0.138972 \n",
|
||
"20 NaN 0.031205 "
|
||
]
|
||
},
|
||
"execution_count": 152,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn import preprocessing\n",
|
||
"\n",
|
||
"stndart_scaler = preprocessing.StandardScaler()\n",
|
||
"\n",
|
||
"titanic_norm[\"AgeStand\"] = stndart_scaler.fit_transform(\n",
|
||
" titanic_norm[\"Age\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(titanic_norm[\"Age\"].shape)\n",
|
||
"\n",
|
||
"titanic_norm[\"AgeClipStand\"] = stndart_scaler.fit_transform(\n",
|
||
" titanic_norm[\"AgeClip\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(titanic_norm[\"Age\"].shape)\n",
|
||
"\n",
|
||
"titanic_norm[\"AgeWinsorizeStand\"] = stndart_scaler.fit_transform(\n",
|
||
" titanic_norm[\"AgeWinsorize\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(titanic_norm[\"Age\"].shape)\n",
|
||
"\n",
|
||
"titanic_norm[[\"Name\", \"Age\", \"AgeStand\", \"AgeClipStand\", \"AgeWinsorizeStand\"]].head(20)"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": ".venv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.7"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|