pred_analytics/lec3.ipynb
2025-01-13 14:42:39 +04:00

4279 lines
160 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Загрузка набора данных Titanic"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Name</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Parch</th>\n",
" <th>Ticket</th>\n",
" <th>Fare</th>\n",
" <th>Cabin</th>\n",
" <th>Embarked</th>\n",
" </tr>\n",
" <tr>\n",
" <th>PassengerId</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>Braund, Mr. Owen Harris</td>\n",
" <td>male</td>\n",
" <td>22.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>A/5 21171</td>\n",
" <td>7.2500</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
" <td>female</td>\n",
" <td>38.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>PC 17599</td>\n",
" <td>71.2833</td>\n",
" <td>C85</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>Heikkinen, Miss. Laina</td>\n",
" <td>female</td>\n",
" <td>26.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>STON/O2. 3101282</td>\n",
" <td>7.9250</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
" <td>female</td>\n",
" <td>35.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>113803</td>\n",
" <td>53.1000</td>\n",
" <td>C123</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>Allen, Mr. William Henry</td>\n",
" <td>male</td>\n",
" <td>35.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>373450</td>\n",
" <td>8.0500</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>887</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>Montvila, Rev. Juozas</td>\n",
" <td>male</td>\n",
" <td>27.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>211536</td>\n",
" <td>13.0000</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>888</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Graham, Miss. Margaret Edith</td>\n",
" <td>female</td>\n",
" <td>19.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>112053</td>\n",
" <td>30.0000</td>\n",
" <td>B42</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>889</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
" <td>female</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>W./C. 6607</td>\n",
" <td>23.4500</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>890</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Behr, Mr. Karl Howell</td>\n",
" <td>male</td>\n",
" <td>26.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>111369</td>\n",
" <td>30.0000</td>\n",
" <td>C148</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>891</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>Dooley, Mr. Patrick</td>\n",
" <td>male</td>\n",
" <td>32.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>370376</td>\n",
" <td>7.7500</td>\n",
" <td>NaN</td>\n",
" <td>Q</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>891 rows × 11 columns</p>\n",
"</div>"
],
"text/plain": [
" Survived Pclass \\\n",
"PassengerId \n",
"1 0 3 \n",
"2 1 1 \n",
"3 1 3 \n",
"4 1 1 \n",
"5 0 3 \n",
"... ... ... \n",
"887 0 2 \n",
"888 1 1 \n",
"889 0 3 \n",
"890 1 1 \n",
"891 0 3 \n",
"\n",
" Name Sex Age \\\n",
"PassengerId \n",
"1 Braund, Mr. Owen Harris male 22.0 \n",
"2 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 \n",
"3 Heikkinen, Miss. Laina female 26.0 \n",
"4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 \n",
"5 Allen, Mr. William Henry male 35.0 \n",
"... ... ... ... \n",
"887 Montvila, Rev. Juozas male 27.0 \n",
"888 Graham, Miss. Margaret Edith female 19.0 \n",
"889 Johnston, Miss. Catherine Helen \"Carrie\" female NaN \n",
"890 Behr, Mr. Karl Howell male 26.0 \n",
"891 Dooley, Mr. Patrick male 32.0 \n",
"\n",
" SibSp Parch Ticket Fare Cabin Embarked \n",
"PassengerId \n",
"1 1 0 A/5 21171 7.2500 NaN S \n",
"2 1 0 PC 17599 71.2833 C85 C \n",
"3 0 0 STON/O2. 3101282 7.9250 NaN S \n",
"4 1 0 113803 53.1000 C123 S \n",
"5 0 0 373450 8.0500 NaN S \n",
"... ... ... ... ... ... ... \n",
"887 0 0 211536 13.0000 NaN S \n",
"888 0 0 112053 30.0000 B42 S \n",
"889 1 2 W./C. 6607 23.4500 NaN S \n",
"890 0 0 111369 30.0000 C148 C \n",
"891 0 0 370376 7.7500 NaN Q \n",
"\n",
"[891 rows x 11 columns]"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Load the passenger table and use the PassengerId column as the row index.\n",
"titanic = pd.read_csv(\n",
"    \"data/titanic.csv\",\n",
"    index_col=\"PassengerId\",\n",
")\n",
"\n",
"titanic"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Унитарное кодирование\n",
"\n",
"Преобразование категориального признака в несколько бинарных признаков"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Унитарное кодирование признаков Пол (Sex) и Порт посадки (Embarked)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Кодирование"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Embarked_Q</th>\n",
" <th>Embarked_S</th>\n",
" <th>Embarked_nan</th>\n",
" <th>Sex_male</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>886</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>887</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>888</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>889</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>890</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>891 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" Embarked_Q Embarked_S Embarked_nan Sex_male\n",
"0 0.0 1.0 0.0 1.0\n",
"1 0.0 0.0 0.0 0.0\n",
"2 0.0 1.0 0.0 0.0\n",
"3 0.0 1.0 0.0 0.0\n",
"4 0.0 1.0 0.0 1.0\n",
".. ... ... ... ...\n",
"886 0.0 1.0 0.0 1.0\n",
"887 0.0 1.0 0.0 0.0\n",
"888 0.0 1.0 0.0 0.0\n",
"889 0.0 0.0 0.0 1.0\n",
"890 1.0 0.0 0.0 1.0\n",
"\n",
"[891 rows x 4 columns]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import OneHotEncoder\n",
"import numpy as np\n",
"\n",
"# drop=\"first\" omits the first category of each feature, so every remaining\n",
"# column is an indicator relative to that dropped baseline category.\n",
"encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
"\n",
"encoded_values = encoder.fit_transform(titanic[[\"Embarked\", \"Sex\"]])\n",
"\n",
"encoded_columns = encoder.get_feature_names_out([\"Embarked\", \"Sex\"])\n",
"\n",
"# fit_transform returns a plain array in the row order of `titanic`. Reuse the\n",
"# PassengerId index so the frame aligns label-for-label with `titanic`; with the\n",
"# default 0..n-1 index a later pd.concat(axis=1) pairs each passenger with the\n",
"# wrong encoded row and adds an extra all-NaN row.\n",
"encoded_values_df = pd.DataFrame(\n",
"    encoded_values, columns=encoded_columns, index=titanic.index\n",
")\n",
"\n",
"encoded_values_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Добавление признаков в исходный Dataframe"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Name</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Parch</th>\n",
" <th>Ticket</th>\n",
" <th>Fare</th>\n",
" <th>Cabin</th>\n",
" <th>Embarked</th>\n",
" <th>Embarked_Q</th>\n",
" <th>Embarked_S</th>\n",
" <th>Embarked_nan</th>\n",
" <th>Sex_male</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" <td>Braund, Mr. Owen Harris</td>\n",
" <td>male</td>\n",
" <td>22.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>A/5 21171</td>\n",
" <td>7.2500</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
" <td>female</td>\n",
" <td>38.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>PC 17599</td>\n",
" <td>71.2833</td>\n",
" <td>C85</td>\n",
" <td>C</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1.0</td>\n",
" <td>3.0</td>\n",
" <td>Heikkinen, Miss. Laina</td>\n",
" <td>female</td>\n",
" <td>26.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>STON/O2. 3101282</td>\n",
" <td>7.9250</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
" <td>female</td>\n",
" <td>35.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>113803</td>\n",
" <td>53.1000</td>\n",
" <td>C123</td>\n",
" <td>S</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" <td>Allen, Mr. William Henry</td>\n",
" <td>male</td>\n",
" <td>35.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>373450</td>\n",
" <td>8.0500</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>888</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>Graham, Miss. Margaret Edith</td>\n",
" <td>female</td>\n",
" <td>19.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>112053</td>\n",
" <td>30.0000</td>\n",
" <td>B42</td>\n",
" <td>S</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>889</th>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" <td>Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
" <td>female</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>W./C. 6607</td>\n",
" <td>23.4500</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>890</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>Behr, Mr. Karl Howell</td>\n",
" <td>male</td>\n",
" <td>26.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>111369</td>\n",
" <td>30.0000</td>\n",
" <td>C148</td>\n",
" <td>C</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>891</th>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" <td>Dooley, Mr. Patrick</td>\n",
" <td>male</td>\n",
" <td>32.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>370376</td>\n",
" <td>7.7500</td>\n",
" <td>NaN</td>\n",
" <td>Q</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>892 rows × 15 columns</p>\n",
"</div>"
],
"text/plain": [
" Survived Pclass Name \\\n",
"1 0.0 3.0 Braund, Mr. Owen Harris \n",
"2 1.0 1.0 Cumings, Mrs. John Bradley (Florence Briggs Th... \n",
"3 1.0 3.0 Heikkinen, Miss. Laina \n",
"4 1.0 1.0 Futrelle, Mrs. Jacques Heath (Lily May Peel) \n",
"5 0.0 3.0 Allen, Mr. William Henry \n",
".. ... ... ... \n",
"888 1.0 1.0 Graham, Miss. Margaret Edith \n",
"889 0.0 3.0 Johnston, Miss. Catherine Helen \"Carrie\" \n",
"890 1.0 1.0 Behr, Mr. Karl Howell \n",
"891 0.0 3.0 Dooley, Mr. Patrick \n",
"0 NaN NaN NaN \n",
"\n",
" Sex Age SibSp Parch Ticket Fare Cabin Embarked \\\n",
"1 male 22.0 1.0 0.0 A/5 21171 7.2500 NaN S \n",
"2 female 38.0 1.0 0.0 PC 17599 71.2833 C85 C \n",
"3 female 26.0 0.0 0.0 STON/O2. 3101282 7.9250 NaN S \n",
"4 female 35.0 1.0 0.0 113803 53.1000 C123 S \n",
"5 male 35.0 0.0 0.0 373450 8.0500 NaN S \n",
".. ... ... ... ... ... ... ... ... \n",
"888 female 19.0 0.0 0.0 112053 30.0000 B42 S \n",
"889 female NaN 1.0 2.0 W./C. 6607 23.4500 NaN S \n",
"890 male 26.0 0.0 0.0 111369 30.0000 C148 C \n",
"891 male 32.0 0.0 0.0 370376 7.7500 NaN Q \n",
"0 NaN NaN NaN NaN NaN NaN NaN NaN \n",
"\n",
" Embarked_Q Embarked_S Embarked_nan Sex_male \n",
"1 0.0 0.0 0.0 0.0 \n",
"2 0.0 1.0 0.0 0.0 \n",
"3 0.0 1.0 0.0 0.0 \n",
"4 0.0 1.0 0.0 1.0 \n",
"5 1.0 0.0 0.0 1.0 \n",
".. ... ... ... ... \n",
"888 0.0 1.0 0.0 0.0 \n",
"889 0.0 0.0 0.0 1.0 \n",
"890 1.0 0.0 0.0 1.0 \n",
"891 NaN NaN NaN NaN \n",
"0 0.0 1.0 0.0 1.0 \n",
"\n",
"[892 rows x 15 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# pd.concat(axis=1) aligns rows by index label. encoded_values_df holds its rows\n",
"# in the same order as `titanic`, so give it titanic's PassengerId index first;\n",
"# otherwise the labels do not match and the result gets an extra all-NaN row\n",
"# and encodings attached to the wrong passengers.\n",
"titanic = pd.concat(\n",
"    [\n",
"        # Drop encoded columns left by a previous run so the cell can be re-executed.\n",
"        titanic.drop(columns=encoded_values_df.columns, errors=\"ignore\"),\n",
"        encoded_values_df.set_axis(titanic.index, axis=0),\n",
"    ],\n",
"    axis=1,\n",
")\n",
"\n",
"titanic"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Дискретизация признаков"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Равномерное разделение данных на 3 группы"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"labels = [\"young\", \"middle-aged\", \"old\"]\n",
"# One bin per label: pd.cut needs exactly as many labels as intervals, so derive\n",
"# the bin count from the label list instead of keeping two values in sync by hand.\n",
"num_bins = len(labels)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([ 0.42 , 26.94666667, 53.47333333, 80. ]),\n",
" array([319, 523, 50]))"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Equal-width histogram of Age; missing ages are filled with the median for\n",
"# counting only (the Age column itself is not modified).\n",
"hist1, bins1 = np.histogram(\n",
"    titanic[\"Age\"].fillna(titanic[\"Age\"].median()),\n",
"    bins=num_bins,\n",
")\n",
"bins1, hist1"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age</th>\n",
" <th>Age</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>22.0</td>\n",
" <td>(0.42, 26.947]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>38.0</td>\n",
" <td>(26.947, 53.473]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>26.0</td>\n",
" <td>(0.42, 26.947]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>35.0</td>\n",
" <td>(26.947, 53.473]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>35.0</td>\n",
" <td>(26.947, 53.473]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>54.0</td>\n",
" <td>(53.473, 80.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>2.0</td>\n",
" <td>(0.42, 26.947]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>27.0</td>\n",
" <td>(26.947, 53.473]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>14.0</td>\n",
" <td>(0.42, 26.947]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>4.0</td>\n",
" <td>(0.42, 26.947]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>58.0</td>\n",
" <td>(53.473, 80.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>20.0</td>\n",
" <td>(0.42, 26.947]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>39.0</td>\n",
" <td>(26.947, 53.473]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>14.0</td>\n",
" <td>(0.42, 26.947]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>55.0</td>\n",
" <td>(53.473, 80.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>2.0</td>\n",
" <td>(0.42, 26.947]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>31.0</td>\n",
" <td>(26.947, 53.473]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Age Age\n",
"1 22.0 (0.42, 26.947]\n",
"2 38.0 (26.947, 53.473]\n",
"3 26.0 (0.42, 26.947]\n",
"4 35.0 (26.947, 53.473]\n",
"5 35.0 (26.947, 53.473]\n",
"6 NaN NaN\n",
"7 54.0 (53.473, 80.0]\n",
"8 2.0 (0.42, 26.947]\n",
"9 27.0 (26.947, 53.473]\n",
"10 14.0 (0.42, 26.947]\n",
"11 4.0 (0.42, 26.947]\n",
"12 58.0 (53.473, 80.0]\n",
"13 20.0 (0.42, 26.947]\n",
"14 39.0 (26.947, 53.473]\n",
"15 14.0 (0.42, 26.947]\n",
"16 55.0 (53.473, 80.0]\n",
"17 2.0 (0.42, 26.947]\n",
"18 NaN NaN\n",
"19 31.0 (26.947, 53.473]\n",
"20 NaN NaN"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# bins1[0] is the minimum age. pd.cut intervals are left-open by default, so\n",
"# without include_lowest=True that minimum falls outside the first interval and\n",
"# is returned as NaN even though np.histogram counted it.\n",
"pd.concat(\n",
"    [titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins1), include_lowest=True)],\n",
"    axis=1,\n",
").head(20)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age</th>\n",
" <th>Age</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>22.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>38.0</td>\n",
" <td>middle-aged</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>26.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>35.0</td>\n",
" <td>middle-aged</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>35.0</td>\n",
" <td>middle-aged</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>54.0</td>\n",
" <td>old</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>2.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>27.0</td>\n",
" <td>middle-aged</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>14.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>4.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>58.0</td>\n",
" <td>old</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>20.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>39.0</td>\n",
" <td>middle-aged</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>14.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>55.0</td>\n",
" <td>old</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>2.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>31.0</td>\n",
" <td>middle-aged</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Age Age\n",
"1 22.0 young\n",
"2 38.0 middle-aged\n",
"3 26.0 young\n",
"4 35.0 middle-aged\n",
"5 35.0 middle-aged\n",
"6 NaN NaN\n",
"7 54.0 old\n",
"8 2.0 young\n",
"9 27.0 middle-aged\n",
"10 14.0 young\n",
"11 4.0 young\n",
"12 58.0 old\n",
"13 20.0 young\n",
"14 39.0 middle-aged\n",
"15 14.0 young\n",
"16 55.0 old\n",
"17 2.0 young\n",
"18 NaN NaN\n",
"19 31.0 middle-aged\n",
"20 NaN NaN"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# bins1[0] is the minimum age. pd.cut intervals are left-open by default, so\n",
"# without include_lowest=True that minimum gets no label (NaN) even though\n",
"# np.histogram counted it in the first bin.\n",
"pd.concat(\n",
"    [\n",
"        titanic[\"Age\"],\n",
"        pd.cut(titanic[\"Age\"], list(bins1), labels=labels, include_lowest=True),\n",
"    ],\n",
"    axis=1,\n",
").head(20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Равномерное разделение данных на 3 группы с установкой собственных границ диапазона значений (от 0 до 100)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([ 0. , 33.33333333, 66.66666667, 100. ]),\n",
" array([641, 244, 7]))"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 4 equally spaced edges over the fixed range 0..100 give 3 equal-width bins.\n",
"bins2 = np.linspace(0, 100, 4)\n",
"# np.digitize returns 1-based bin numbers for values in [bins2[0], bins2[-1]).\n",
"tmp_bins2 = np.digitize(titanic[\"Age\"].fillna(titanic[\"Age\"].median()), bins2)\n",
"# minlength keeps one count per bin even when the upper bins are empty;\n",
"# without it the length of hist2 would depend on the data.\n",
"hist2 = np.bincount(tmp_bins2 - 1, minlength=len(bins2) - 1)\n",
"bins2, hist2"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age</th>\n",
" <th>Age</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>22.0</td>\n",
" <td>(0.0, 33.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>38.0</td>\n",
" <td>(33.333, 66.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>26.0</td>\n",
" <td>(0.0, 33.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>35.0</td>\n",
" <td>(33.333, 66.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>35.0</td>\n",
" <td>(33.333, 66.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>54.0</td>\n",
" <td>(33.333, 66.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>2.0</td>\n",
" <td>(0.0, 33.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>27.0</td>\n",
" <td>(0.0, 33.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>14.0</td>\n",
" <td>(0.0, 33.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>4.0</td>\n",
" <td>(0.0, 33.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>58.0</td>\n",
" <td>(33.333, 66.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>20.0</td>\n",
" <td>(0.0, 33.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>39.0</td>\n",
" <td>(33.333, 66.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>14.0</td>\n",
" <td>(0.0, 33.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>55.0</td>\n",
" <td>(33.333, 66.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>2.0</td>\n",
" <td>(0.0, 33.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>31.0</td>\n",
" <td>(0.0, 33.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Age Age\n",
"1 22.0 (0.0, 33.333]\n",
"2 38.0 (33.333, 66.667]\n",
"3 26.0 (0.0, 33.333]\n",
"4 35.0 (33.333, 66.667]\n",
"5 35.0 (33.333, 66.667]\n",
"6 NaN NaN\n",
"7 54.0 (33.333, 66.667]\n",
"8 2.0 (0.0, 33.333]\n",
"9 27.0 (0.0, 33.333]\n",
"10 14.0 (0.0, 33.333]\n",
"11 4.0 (0.0, 33.333]\n",
"12 58.0 (33.333, 66.667]\n",
"13 20.0 (0.0, 33.333]\n",
"14 39.0 (33.333, 66.667]\n",
"15 14.0 (0.0, 33.333]\n",
"16 55.0 (33.333, 66.667]\n",
"17 2.0 (0.0, 33.333]\n",
"18 NaN NaN\n",
"19 31.0 (0.0, 33.333]\n",
"20 NaN NaN"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the first 20 ages next to the bins2 interval each one falls into.\n",
"pd.concat(\n",
"    objs=[titanic[\"Age\"], pd.cut(x=titanic[\"Age\"], bins=list(bins2))],\n",
"    axis=1,\n",
").head(n=20)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age</th>\n",
" <th>Age</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>22.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>38.0</td>\n",
" <td>middle-aged</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>26.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>35.0</td>\n",
" <td>middle-aged</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>35.0</td>\n",
" <td>middle-aged</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>54.0</td>\n",
" <td>middle-aged</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>2.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>27.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>14.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>4.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>58.0</td>\n",
" <td>middle-aged</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>20.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>39.0</td>\n",
" <td>middle-aged</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>14.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>55.0</td>\n",
" <td>middle-aged</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>2.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>31.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Age Age\n",
"1 22.0 young\n",
"2 38.0 middle-aged\n",
"3 26.0 young\n",
"4 35.0 middle-aged\n",
"5 35.0 middle-aged\n",
"6 NaN NaN\n",
"7 54.0 middle-aged\n",
"8 2.0 young\n",
"9 27.0 young\n",
"10 14.0 young\n",
"11 4.0 young\n",
"12 58.0 middle-aged\n",
"13 20.0 young\n",
"14 39.0 middle-aged\n",
"15 14.0 young\n",
"16 55.0 middle-aged\n",
"17 2.0 young\n",
"18 NaN NaN\n",
"19 31.0 young\n",
"20 NaN NaN"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the first 20 ages next to the label of the bins2 interval they fall into.\n",
"pd.concat(\n",
"    objs=[\n",
"        titanic[\"Age\"],\n",
"        pd.cut(x=titanic[\"Age\"], bins=list(bins2), labels=labels),\n",
"    ],\n",
"    axis=1,\n",
").head(n=20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разделение данных на 3 группы с установкой собственных границ интервалов (0, 40, 60, 100)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([ 0, 40, 60, 100]), array([729, 137, 26]))"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Histogram of Age over explicitly listed bin edges; missing ages are filled\n",
"# with the median for counting only.\n",
"hist3, bins3 = np.histogram(\n",
"    a=titanic[\"Age\"].fillna(value=titanic[\"Age\"].median()),\n",
"    bins=[0, 40, 60, 100],\n",
")\n",
"bins3, hist3"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age</th>\n",
" <th>Age</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>22.0</td>\n",
" <td>(0.0, 40.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>38.0</td>\n",
" <td>(0.0, 40.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>26.0</td>\n",
" <td>(0.0, 40.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>35.0</td>\n",
" <td>(0.0, 40.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>35.0</td>\n",
" <td>(0.0, 40.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>54.0</td>\n",
" <td>(40.0, 60.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>2.0</td>\n",
" <td>(0.0, 40.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>27.0</td>\n",
" <td>(0.0, 40.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>14.0</td>\n",
" <td>(0.0, 40.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>4.0</td>\n",
" <td>(0.0, 40.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>58.0</td>\n",
" <td>(40.0, 60.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>20.0</td>\n",
" <td>(0.0, 40.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>39.0</td>\n",
" <td>(0.0, 40.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>14.0</td>\n",
" <td>(0.0, 40.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>55.0</td>\n",
" <td>(40.0, 60.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>2.0</td>\n",
" <td>(0.0, 40.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>31.0</td>\n",
" <td>(0.0, 40.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Age Age\n",
"1 22.0 (0.0, 40.0]\n",
"2 38.0 (0.0, 40.0]\n",
"3 26.0 (0.0, 40.0]\n",
"4 35.0 (0.0, 40.0]\n",
"5 35.0 (0.0, 40.0]\n",
"6 NaN NaN\n",
"7 54.0 (40.0, 60.0]\n",
"8 2.0 (0.0, 40.0]\n",
"9 27.0 (0.0, 40.0]\n",
"10 14.0 (0.0, 40.0]\n",
"11 4.0 (0.0, 40.0]\n",
"12 58.0 (40.0, 60.0]\n",
"13 20.0 (0.0, 40.0]\n",
"14 39.0 (0.0, 40.0]\n",
"15 14.0 (0.0, 40.0]\n",
"16 55.0 (40.0, 60.0]\n",
"17 2.0 (0.0, 40.0]\n",
"18 NaN NaN\n",
"19 31.0 (0.0, 40.0]\n",
"20 NaN NaN"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show each age next to the interval pd.cut assigns to it.\n",
"# `bins3` is defined in an earlier cell and is converted to a list of bin edges here.\n",
"pd.concat(\n",
"    (titanic[\"Age\"], pd.cut(titanic[\"Age\"], bins=list(bins3))),\n",
"    axis=\"columns\",\n",
").head(20)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age</th>\n",
" <th>Age</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>22.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>38.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>26.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>35.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>35.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>54.0</td>\n",
" <td>middle-aged</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>2.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>27.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>14.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>4.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>58.0</td>\n",
" <td>middle-aged</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>20.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>39.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>14.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>55.0</td>\n",
" <td>middle-aged</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>2.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>31.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Age Age\n",
"1 22.0 young\n",
"2 38.0 young\n",
"3 26.0 young\n",
"4 35.0 young\n",
"5 35.0 young\n",
"6 NaN NaN\n",
"7 54.0 middle-aged\n",
"8 2.0 young\n",
"9 27.0 young\n",
"10 14.0 young\n",
"11 4.0 young\n",
"12 58.0 middle-aged\n",
"13 20.0 young\n",
"14 39.0 young\n",
"15 14.0 young\n",
"16 55.0 middle-aged\n",
"17 2.0 young\n",
"18 NaN NaN\n",
"19 31.0 young\n",
"20 NaN NaN"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Bin ages with pd.cut and show the matching name from `labels` instead of the interval.\n",
"# Both `bins3` and `labels` are defined in earlier cells.\n",
"pd.concat(\n",
"    (\n",
"        titanic[\"Age\"],\n",
"        pd.cut(titanic[\"Age\"], bins=list(bins3), labels=labels),\n",
"    ),\n",
"    axis=\"columns\",\n",
").head(20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Квантильное разделение данных на 3 группы"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age</th>\n",
" <th>Age</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>22.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>38.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>26.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>35.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>35.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>54.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>27.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>14.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>58.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>20.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>39.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>14.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>55.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>31.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Age Age\n",
"1 22.0 0.0\n",
"2 38.0 2.0\n",
"3 26.0 1.0\n",
"4 35.0 2.0\n",
"5 35.0 2.0\n",
"6 NaN NaN\n",
"7 54.0 2.0\n",
"8 2.0 0.0\n",
"9 27.0 1.0\n",
"10 14.0 0.0\n",
"11 4.0 0.0\n",
"12 58.0 2.0\n",
"13 20.0 0.0\n",
"14 39.0 2.0\n",
"15 14.0 0.0\n",
"16 55.0 2.0\n",
"17 2.0 0.0\n",
"18 NaN NaN\n",
"19 31.0 1.0\n",
"20 NaN NaN"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Quantile-based split of Age into 3 groups.\n",
"# labels=False makes pd.qcut return the bin number instead of the interval.\n",
"pd.concat(\n",
"    (titanic[\"Age\"], pd.qcut(titanic[\"Age\"], q=3, labels=False)),\n",
"    axis=\"columns\",\n",
").head(20)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age</th>\n",
" <th>Age</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>22.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>38.0</td>\n",
" <td>old</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>26.0</td>\n",
" <td>middle-aged</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>35.0</td>\n",
" <td>old</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>35.0</td>\n",
" <td>old</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>54.0</td>\n",
" <td>old</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>2.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>27.0</td>\n",
" <td>middle-aged</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>14.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>4.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>58.0</td>\n",
" <td>old</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>20.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>39.0</td>\n",
" <td>old</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>14.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>55.0</td>\n",
" <td>old</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>2.0</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>31.0</td>\n",
" <td>middle-aged</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Age Age\n",
"1 22.0 young\n",
"2 38.0 old\n",
"3 26.0 middle-aged\n",
"4 35.0 old\n",
"5 35.0 old\n",
"6 NaN NaN\n",
"7 54.0 old\n",
"8 2.0 young\n",
"9 27.0 middle-aged\n",
"10 14.0 young\n",
"11 4.0 young\n",
"12 58.0 old\n",
"13 20.0 young\n",
"14 39.0 old\n",
"15 14.0 young\n",
"16 55.0 old\n",
"17 2.0 young\n",
"18 NaN NaN\n",
"19 31.0 middle-aged\n",
"20 NaN NaN"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Quantile-based split of Age into 3 groups, shown with the names from `labels`\n",
"# (defined in an earlier cell) instead of bin numbers.\n",
"pd.concat(\n",
"    (titanic[\"Age\"], pd.qcut(titanic[\"Age\"], q=3, labels=labels)),\n",
"    axis=\"columns\",\n",
").head(20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Пример конструирования признаков на основе существующих\n",
"\n",
"Title - обращение к пассажиру (Mr, Mrs, Miss)\n",
"\n",
"Is_married - замужняя ли женщина\n",
"\n",
"Cabin_type - палуба (тип каюты)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Name</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Parch</th>\n",
" <th>Ticket</th>\n",
" <th>Fare</th>\n",
" <th>Cabin</th>\n",
" <th>Embarked</th>\n",
" <th>Title</th>\n",
" <th>Is_married</th>\n",
" <th>Cabin_type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
" <td>female</td>\n",
" <td>38.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>PC 17599</td>\n",
" <td>71.2833</td>\n",
" <td>C85</td>\n",
" <td>C</td>\n",
" <td>Mrs</td>\n",
" <td>1</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
" <td>female</td>\n",
" <td>35.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>113803</td>\n",
" <td>53.1000</td>\n",
" <td>C123</td>\n",
" <td>S</td>\n",
" <td>Mrs</td>\n",
" <td>1</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>McCarthy, Mr. Timothy J</td>\n",
" <td>male</td>\n",
" <td>54.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>17463</td>\n",
" <td>51.8625</td>\n",
" <td>E46</td>\n",
" <td>S</td>\n",
" <td>Mr</td>\n",
" <td>0</td>\n",
" <td>E</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>1.0</td>\n",
" <td>3.0</td>\n",
" <td>Sandstrom, Miss. Marguerite Rut</td>\n",
" <td>female</td>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>PP 9549</td>\n",
" <td>16.7000</td>\n",
" <td>G6</td>\n",
" <td>S</td>\n",
" <td>Miss</td>\n",
" <td>0</td>\n",
" <td>G</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>Bonnell, Miss. Elizabeth</td>\n",
" <td>female</td>\n",
" <td>58.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>113783</td>\n",
" <td>26.5500</td>\n",
" <td>C103</td>\n",
" <td>S</td>\n",
" <td>Miss</td>\n",
" <td>0</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>872</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>Beckwith, Mrs. Richard Leonard (Sallie Monypeny)</td>\n",
" <td>female</td>\n",
" <td>47.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>11751</td>\n",
" <td>52.5542</td>\n",
" <td>D35</td>\n",
" <td>S</td>\n",
" <td>Mrs</td>\n",
" <td>1</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>873</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>Carlsson, Mr. Frans Olof</td>\n",
" <td>male</td>\n",
" <td>33.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>695</td>\n",
" <td>5.0000</td>\n",
" <td>B51 B53 B55</td>\n",
" <td>S</td>\n",
" <td>Mr</td>\n",
" <td>0</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>880</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)</td>\n",
" <td>female</td>\n",
" <td>56.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>11767</td>\n",
" <td>83.1583</td>\n",
" <td>C50</td>\n",
" <td>C</td>\n",
" <td>Mrs</td>\n",
" <td>1</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>888</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>Graham, Miss. Margaret Edith</td>\n",
" <td>female</td>\n",
" <td>19.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>112053</td>\n",
" <td>30.0000</td>\n",
" <td>B42</td>\n",
" <td>S</td>\n",
" <td>Miss</td>\n",
" <td>0</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>890</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>Behr, Mr. Karl Howell</td>\n",
" <td>male</td>\n",
" <td>26.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>111369</td>\n",
" <td>30.0000</td>\n",
" <td>C148</td>\n",
" <td>C</td>\n",
" <td>Mr</td>\n",
" <td>0</td>\n",
" <td>C</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>183 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" Survived Pclass Name \\\n",
"2 1.0 1.0 Cumings, Mrs. John Bradley (Florence Briggs Th... \n",
"4 1.0 1.0 Futrelle, Mrs. Jacques Heath (Lily May Peel) \n",
"7 0.0 1.0 McCarthy, Mr. Timothy J \n",
"11 1.0 3.0 Sandstrom, Miss. Marguerite Rut \n",
"12 1.0 1.0 Bonnell, Miss. Elizabeth \n",
".. ... ... ... \n",
"872 1.0 1.0 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) \n",
"873 0.0 1.0 Carlsson, Mr. Frans Olof \n",
"880 1.0 1.0 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) \n",
"888 1.0 1.0 Graham, Miss. Margaret Edith \n",
"890 1.0 1.0 Behr, Mr. Karl Howell \n",
"\n",
" Sex Age SibSp Parch Ticket Fare Cabin Embarked \\\n",
"2 female 38.0 1.0 0.0 PC 17599 71.2833 C85 C \n",
"4 female 35.0 1.0 0.0 113803 53.1000 C123 S \n",
"7 male 54.0 0.0 0.0 17463 51.8625 E46 S \n",
"11 female 4.0 1.0 1.0 PP 9549 16.7000 G6 S \n",
"12 female 58.0 0.0 0.0 113783 26.5500 C103 S \n",
".. ... ... ... ... ... ... ... ... \n",
"872 female 47.0 1.0 1.0 11751 52.5542 D35 S \n",
"873 male 33.0 0.0 0.0 695 5.0000 B51 B53 B55 S \n",
"880 female 56.0 0.0 1.0 11767 83.1583 C50 C \n",
"888 female 19.0 0.0 0.0 112053 30.0000 B42 S \n",
"890 male 26.0 0.0 0.0 111369 30.0000 C148 C \n",
"\n",
" Title Is_married Cabin_type \n",
"2 Mrs 1 C \n",
"4 Mrs 1 C \n",
"7 Mr 0 E \n",
"11 Miss 0 G \n",
"12 Miss 0 C \n",
".. ... ... ... \n",
"872 Mrs 1 D \n",
"873 Mr 0 B \n",
"880 Mrs 1 C \n",
"888 Miss 0 B \n",
"890 Mr 0 C \n",
"\n",
"[183 rows x 14 columns]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Start from a new frame without the listed encoded columns and without rows\n",
"# that contain any missing value. errors=\"ignore\" makes the drop a no-op for\n",
"# columns that are not present.\n",
"titanic_cl = titanic.drop(\n",
"    columns=[\"Embarked_Q\", \"Embarked_S\", \"Embarked_nan\", \"Sex_male\"],\n",
"    errors=\"ignore\",\n",
").dropna()\n",
"\n",
"# Title: the text between the comma and the next period in Name,\n",
"# e.g. \"Braund, Mr. Owen Harris\" -> \"Mr\". The vectorized .str accessors give\n",
"# NaN for a name without a comma instead of raising IndexError.\n",
"titanic_cl[\"Title\"] = (\n",
"    titanic_cl[\"Name\"].str.split(\",\").str[1].str.split(\".\").str[0].str.strip()\n",
")\n",
"\n",
"# Is_married: 1 only when the title is exactly \"Mrs\", 0 for every other title.\n",
"titanic_cl[\"Is_married\"] = titanic_cl[\"Title\"].eq(\"Mrs\").astype(\"int64\")\n",
"\n",
"# Cabin_type: first character of the Cabin value (the deck letter).\n",
"titanic_cl[\"Cabin_type\"] = titanic_cl[\"Cabin\"].str[0]\n",
"\n",
"titanic_cl"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Пример использования библиотеки Featuretools для автоматического конструирования (синтеза) признаков\n",
"\n",
"https://featuretools.alteryx.com/en/stable/getting_started/using_entitysets.html"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Загрузка данных\n",
"\n",
"За основу был взят набор данных \"Ecommerce Orders Data Set\" из Kaggle\n",
"\n",
"Используются только первые 100 заказов и связанные с ними объекты\n",
"\n",
"https://www.kaggle.com/datasets/sangamsharmait/ecommerce-orders-data-analysis"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"import featuretools as ft\n",
"from woodwork.logical_types import Categorical, Datetime\n",
"\n",
"customers = pd.read_csv(\"data/orders/customers.csv\")\n",
"sellers = pd.read_csv(\"data/orders/sellers.csv\")\n",
"products = pd.read_csv(\"data/orders/products.csv\")\n",
"\n",
"# Missing delivery dates are replaced by a fixed placeholder timestamp.\n",
"# A single non-in-place fillna with a per-column mapping replaces the two\n",
"# in-place calls; the filled values are the same.\n",
"# NOTE(review): the CSV is read without parse_dates, so these columns are\n",
"# presumably still text here and the placeholder is a Timestamp among strings;\n",
"# confirm that the Datetime logical type assigned later normalises this.\n",
"orders = pd.read_csv(\"data/orders/orders.csv\").fillna(\n",
"    {\n",
"        \"order_delivered_carrier_date\": pd.to_datetime(\"1900-01-01 00:00:00\"),\n",
"        \"order_delivered_customer_date\": pd.to_datetime(\"1900-01-01 00:00:00\"),\n",
"    }\n",
")\n",
"order_items = pd.read_csv(\"data/orders/order_items.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Создание сущностей в featuretools\n",
"\n",
"Добавление dataframe'ов с данными в EntitySet с указанием параметров: название сущности (таблицы), первичный ключ, логические типы атрибутов (категориальные и дата/время)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n"
]
},
{
"data": {
"text/plain": [
"Entityset: orders\n",
" DataFrames:\n",
" customers [Rows: 100, Columns: 5]\n",
" sellers [Rows: 87, Columns: 4]\n",
" products [Rows: 100, Columns: 9]\n",
" orders [Rows: 100, Columns: 8]\n",
" order_items [Rows: 115, Columns: 8]\n",
" Relationships:\n",
" No relationships"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Register every table in one EntitySet: table name, primary-key column and\n",
"# the logical types given explicitly for selected columns.\n",
"es = (\n",
"    ft.EntitySet(id=\"orders\")\n",
"    .add_dataframe(\n",
"        dataframe_name=\"customers\",\n",
"        dataframe=customers,\n",
"        index=\"customer_id\",\n",
"        logical_types=dict.fromkeys(\n",
"            [\n",
"                \"customer_unique_id\",\n",
"                \"customer_zip_code_prefix\",\n",
"                \"customer_city\",\n",
"                \"customer_state\",\n",
"            ],\n",
"            Categorical,\n",
"        ),\n",
"    )\n",
"    .add_dataframe(\n",
"        dataframe_name=\"sellers\",\n",
"        dataframe=sellers,\n",
"        index=\"seller_id\",\n",
"        logical_types=dict.fromkeys(\n",
"            [\"seller_zip_code_prefix\", \"seller_city\", \"seller_state\"],\n",
"            Categorical,\n",
"        ),\n",
"    )\n",
"    .add_dataframe(\n",
"        dataframe_name=\"products\",\n",
"        dataframe=products,\n",
"        index=\"product_id\",\n",
"        # The \"lenght\" spelling is the column name used in the data.\n",
"        logical_types=dict.fromkeys(\n",
"            [\n",
"                \"product_category_name\",\n",
"                \"product_name_lenght\",\n",
"                \"product_description_lenght\",\n",
"                \"product_photos_qty\",\n",
"            ],\n",
"            Categorical,\n",
"        ),\n",
"    )\n",
"    .add_dataframe(\n",
"        dataframe_name=\"orders\",\n",
"        dataframe=orders,\n",
"        index=\"order_id\",\n",
"        logical_types={\n",
"            \"order_status\": Categorical,\n",
"            **dict.fromkeys(\n",
"                [\n",
"                    \"order_purchase_timestamp\",\n",
"                    \"order_approved_at\",\n",
"                    \"order_delivered_carrier_date\",\n",
"                    \"order_delivered_customer_date\",\n",
"                    \"order_estimated_delivery_date\",\n",
"                ],\n",
"                Datetime,\n",
"            ),\n",
"        },\n",
"    )\n",
"    .add_dataframe(\n",
"        dataframe_name=\"order_items\",\n",
"        dataframe=order_items,\n",
"        # make_index=True: the orderitem_id index column is created by\n",
"        # featuretools rather than read from the data.\n",
"        index=\"orderitem_id\",\n",
"        make_index=True,\n",
"        logical_types={\"shipping_limit_date\": Datetime},\n",
"    )\n",
")\n",
"\n",
"es"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Настройка связей между сущностями featuretools\n",
"\n",
"Настройка связей между таблицами на уровне ключей\n",
"\n",
"Связь указывается от родителя к потомкам (таблица-родитель, первичный ключ, таблица-потомок, внешний ключ)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Entityset: orders\n",
" DataFrames:\n",
" customers [Rows: 100, Columns: 5]\n",
" sellers [Rows: 87, Columns: 4]\n",
" products [Rows: 100, Columns: 9]\n",
" orders [Rows: 100, Columns: 8]\n",
" order_items [Rows: 115, Columns: 8]\n",
" Relationships:\n",
" orders.customer_id -> customers.customer_id\n",
" order_items.order_id -> orders.order_id\n",
" order_items.product_id -> products.product_id\n",
" order_items.seller_id -> sellers.seller_id"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Link the tables by key. Argument order for every call:\n",
"# parent table, parent key column, child table, child foreign-key column.\n",
"es = (\n",
"    es.add_relationship(\"customers\", \"customer_id\", \"orders\", \"customer_id\")\n",
"    .add_relationship(\"orders\", \"order_id\", \"order_items\", \"order_id\")\n",
"    .add_relationship(\"products\", \"product_id\", \"order_items\", \"product_id\")\n",
"    .add_relationship(\"sellers\", \"seller_id\", \"order_items\", \"seller_id\")\n",
")\n",
"\n",
"es"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Автоматическое конструирование признаков с помощью featuretools\n",
"\n",
"Библиотека применяет различные функции агрегации и трансформации к атрибутам таблицы order_items с учетом отношений\n",
"\n",
"Результат помещается в Dataframe feature_matrix"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\featuretools\\synthesis\\dfs.py:321: UnusedPrimitiveWarning: Some specified primitives were not used during DFS:\n",
" agg_primitives: ['any', 'mode']\n",
"This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.\n",
" warnings.warn(warning_msg, UnusedPrimitiveWarning)\n",
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function mean at 0x00000245E1C73EC0> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
" ).agg(to_agg)\n",
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function mean at 0x00000245E1C73EC0> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
" ).agg(to_agg)\n",
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function mean at 0x00000245E1C73EC0> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
" ).agg(to_agg)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>order_item_id</th>\n",
" <th>price</th>\n",
" <th>freight_value</th>\n",
" <th>HOUR(shipping_limit_date)</th>\n",
" <th>WEEKDAY(shipping_limit_date)</th>\n",
" <th>orders.order_status</th>\n",
" <th>products.product_category_name</th>\n",
" <th>products.product_name_lenght</th>\n",
" <th>products.product_description_lenght</th>\n",
" <th>products.product_photos_qty</th>\n",
" <th>...</th>\n",
" <th>orders.customers.customer_city</th>\n",
" <th>orders.customers.customer_state</th>\n",
" <th>products.COUNT(order_items)</th>\n",
" <th>products.MEAN(order_items.freight_value)</th>\n",
" <th>products.MEAN(order_items.order_item_id)</th>\n",
" <th>products.MEAN(order_items.price)</th>\n",
" <th>sellers.COUNT(order_items)</th>\n",
" <th>sellers.MEAN(order_items.freight_value)</th>\n",
" <th>sellers.MEAN(order_items.order_item_id)</th>\n",
" <th>sellers.MEAN(order_items.price)</th>\n",
" </tr>\n",
" <tr>\n",
" <th>orderitem_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>38.50</td>\n",
" <td>24.84</td>\n",
" <td>20</td>\n",
" <td>4</td>\n",
" <td>delivered</td>\n",
" <td>cama_mesa_banho</td>\n",
" <td>53.0</td>\n",
" <td>223.0</td>\n",
" <td>1.0</td>\n",
" <td>...</td>\n",
" <td>santa luzia</td>\n",
" <td>PB</td>\n",
" <td>1</td>\n",
" <td>24.84</td>\n",
" <td>1.0</td>\n",
" <td>38.50</td>\n",
" <td>2</td>\n",
" <td>21.340</td>\n",
" <td>1.0</td>\n",
" <td>61.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>29.99</td>\n",
" <td>7.39</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" <td>delivered</td>\n",
" <td>telefonia</td>\n",
" <td>59.0</td>\n",
" <td>675.0</td>\n",
" <td>5.0</td>\n",
" <td>...</td>\n",
" <td>sao paulo</td>\n",
" <td>SP</td>\n",
" <td>1</td>\n",
" <td>7.39</td>\n",
" <td>1.0</td>\n",
" <td>29.99</td>\n",
" <td>1</td>\n",
" <td>7.390</td>\n",
" <td>1.0</td>\n",
" <td>29.990000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>110.99</td>\n",
" <td>21.27</td>\n",
" <td>21</td>\n",
" <td>1</td>\n",
" <td>delivered</td>\n",
" <td>cama_mesa_banho</td>\n",
" <td>52.0</td>\n",
" <td>413.0</td>\n",
" <td>1.0</td>\n",
" <td>...</td>\n",
" <td>gravatai</td>\n",
" <td>RS</td>\n",
" <td>1</td>\n",
" <td>21.27</td>\n",
" <td>1.0</td>\n",
" <td>110.99</td>\n",
" <td>1</td>\n",
" <td>21.270</td>\n",
" <td>1.0</td>\n",
" <td>110.990000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>27.99</td>\n",
" <td>15.10</td>\n",
" <td>23</td>\n",
" <td>1</td>\n",
" <td>delivered</td>\n",
" <td>telefonia</td>\n",
" <td>60.0</td>\n",
" <td>818.0</td>\n",
" <td>6.0</td>\n",
" <td>...</td>\n",
" <td>imbituba</td>\n",
" <td>SC</td>\n",
" <td>1</td>\n",
" <td>15.10</td>\n",
" <td>1.0</td>\n",
" <td>27.99</td>\n",
" <td>2</td>\n",
" <td>13.970</td>\n",
" <td>1.0</td>\n",
" <td>26.490000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>49.90</td>\n",
" <td>16.05</td>\n",
" <td>13</td>\n",
" <td>2</td>\n",
" <td>invoiced</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>santa rosa</td>\n",
" <td>RS</td>\n",
" <td>1</td>\n",
" <td>16.05</td>\n",
" <td>1.0</td>\n",
" <td>49.90</td>\n",
" <td>1</td>\n",
" <td>16.050</td>\n",
" <td>1.0</td>\n",
" <td>49.900000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110</th>\n",
" <td>1</td>\n",
" <td>17.90</td>\n",
" <td>10.96</td>\n",
" <td>8</td>\n",
" <td>1</td>\n",
" <td>delivered</td>\n",
" <td>cama_mesa_banho</td>\n",
" <td>55.0</td>\n",
" <td>122.0</td>\n",
" <td>1.0</td>\n",
" <td>...</td>\n",
" <td>jundiai</td>\n",
" <td>SP</td>\n",
" <td>1</td>\n",
" <td>10.96</td>\n",
" <td>1.0</td>\n",
" <td>17.90</td>\n",
" <td>1</td>\n",
" <td>10.960</td>\n",
" <td>1.0</td>\n",
" <td>17.900000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>111</th>\n",
" <td>1</td>\n",
" <td>79.99</td>\n",
" <td>8.91</td>\n",
" <td>9</td>\n",
" <td>4</td>\n",
" <td>delivered</td>\n",
" <td>beleza_saude</td>\n",
" <td>59.0</td>\n",
" <td>492.0</td>\n",
" <td>3.0</td>\n",
" <td>...</td>\n",
" <td>sao paulo</td>\n",
" <td>SP</td>\n",
" <td>1</td>\n",
" <td>8.91</td>\n",
" <td>1.0</td>\n",
" <td>79.99</td>\n",
" <td>5</td>\n",
" <td>13.206</td>\n",
" <td>1.2</td>\n",
" <td>54.590000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>112</th>\n",
" <td>1</td>\n",
" <td>190.00</td>\n",
" <td>19.41</td>\n",
" <td>13</td>\n",
" <td>3</td>\n",
" <td>delivered</td>\n",
" <td>climatizacao</td>\n",
" <td>60.0</td>\n",
" <td>3270.0</td>\n",
" <td>4.0</td>\n",
" <td>...</td>\n",
" <td>paulinia</td>\n",
" <td>SP</td>\n",
" <td>1</td>\n",
" <td>19.41</td>\n",
" <td>1.0</td>\n",
" <td>190.00</td>\n",
" <td>1</td>\n",
" <td>19.410</td>\n",
" <td>1.0</td>\n",
" <td>190.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>113</th>\n",
" <td>1</td>\n",
" <td>109.90</td>\n",
" <td>15.53</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>delivered</td>\n",
" <td>cool_stuff</td>\n",
" <td>46.0</td>\n",
" <td>595.0</td>\n",
" <td>2.0</td>\n",
" <td>...</td>\n",
" <td>rio de janeiro</td>\n",
" <td>RJ</td>\n",
" <td>1</td>\n",
" <td>15.53</td>\n",
" <td>1.0</td>\n",
" <td>109.90</td>\n",
" <td>1</td>\n",
" <td>15.530</td>\n",
" <td>1.0</td>\n",
" <td>109.900000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>114</th>\n",
" <td>1</td>\n",
" <td>27.90</td>\n",
" <td>18.30</td>\n",
" <td>14</td>\n",
" <td>2</td>\n",
" <td>delivered</td>\n",
" <td>alimentos</td>\n",
" <td>59.0</td>\n",
" <td>982.0</td>\n",
" <td>1.0</td>\n",
" <td>...</td>\n",
" <td>joinville</td>\n",
" <td>SC</td>\n",
" <td>2</td>\n",
" <td>16.70</td>\n",
" <td>1.0</td>\n",
" <td>27.90</td>\n",
" <td>3</td>\n",
" <td>16.190</td>\n",
" <td>1.0</td>\n",
" <td>38.596667</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>115 rows × 43 columns</p>\n",
"</div>"
],
"text/plain": [
" order_item_id price freight_value HOUR(shipping_limit_date) \\\n",
"orderitem_id \n",
"0 1 38.50 24.84 20 \n",
"1 1 29.99 7.39 8 \n",
"2 1 110.99 21.27 21 \n",
"3 1 27.99 15.10 23 \n",
"4 1 49.90 16.05 13 \n",
"... ... ... ... ... \n",
"110 1 17.90 10.96 8 \n",
"111 1 79.99 8.91 9 \n",
"112 1 190.00 19.41 13 \n",
"113 1 109.90 15.53 2 \n",
"114 1 27.90 18.30 14 \n",
"\n",
" WEEKDAY(shipping_limit_date) orders.order_status \\\n",
"orderitem_id \n",
"0 4 delivered \n",
"1 0 delivered \n",
"2 1 delivered \n",
"3 1 delivered \n",
"4 2 invoiced \n",
"... ... ... \n",
"110 1 delivered \n",
"111 4 delivered \n",
"112 3 delivered \n",
"113 2 delivered \n",
"114 2 delivered \n",
"\n",
" products.product_category_name products.product_name_lenght \\\n",
"orderitem_id \n",
"0 cama_mesa_banho 53.0 \n",
"1 telefonia 59.0 \n",
"2 cama_mesa_banho 52.0 \n",
"3 telefonia 60.0 \n",
"4 NaN NaN \n",
"... ... ... \n",
"110 cama_mesa_banho 55.0 \n",
"111 beleza_saude 59.0 \n",
"112 climatizacao 60.0 \n",
"113 cool_stuff 46.0 \n",
"114 alimentos 59.0 \n",
"\n",
" products.product_description_lenght products.product_photos_qty \\\n",
"orderitem_id \n",
"0 223.0 1.0 \n",
"1 675.0 5.0 \n",
"2 413.0 1.0 \n",
"3 818.0 6.0 \n",
"4 NaN NaN \n",
"... ... ... \n",
"110 122.0 1.0 \n",
"111 492.0 3.0 \n",
"112 3270.0 4.0 \n",
"113 595.0 2.0 \n",
"114 982.0 1.0 \n",
"\n",
" ... orders.customers.customer_city \\\n",
"orderitem_id ... \n",
"0 ... santa luzia \n",
"1 ... sao paulo \n",
"2 ... gravatai \n",
"3 ... imbituba \n",
"4 ... santa rosa \n",
"... ... ... \n",
"110 ... jundiai \n",
"111 ... sao paulo \n",
"112 ... paulinia \n",
"113 ... rio de janeiro \n",
"114 ... joinville \n",
"\n",
" orders.customers.customer_state products.COUNT(order_items) \\\n",
"orderitem_id \n",
"0 PB 1 \n",
"1 SP 1 \n",
"2 RS 1 \n",
"3 SC 1 \n",
"4 RS 1 \n",
"... ... ... \n",
"110 SP 1 \n",
"111 SP 1 \n",
"112 SP 1 \n",
"113 RJ 1 \n",
"114 SC 2 \n",
"\n",
" products.MEAN(order_items.freight_value) \\\n",
"orderitem_id \n",
"0 24.84 \n",
"1 7.39 \n",
"2 21.27 \n",
"3 15.10 \n",
"4 16.05 \n",
"... ... \n",
"110 10.96 \n",
"111 8.91 \n",
"112 19.41 \n",
"113 15.53 \n",
"114 16.70 \n",
"\n",
" products.MEAN(order_items.order_item_id) \\\n",
"orderitem_id \n",
"0 1.0 \n",
"1 1.0 \n",
"2 1.0 \n",
"3 1.0 \n",
"4 1.0 \n",
"... ... \n",
"110 1.0 \n",
"111 1.0 \n",
"112 1.0 \n",
"113 1.0 \n",
"114 1.0 \n",
"\n",
" products.MEAN(order_items.price) sellers.COUNT(order_items) \\\n",
"orderitem_id \n",
"0 38.50 2 \n",
"1 29.99 1 \n",
"2 110.99 1 \n",
"3 27.99 2 \n",
"4 49.90 1 \n",
"... ... ... \n",
"110 17.90 1 \n",
"111 79.99 5 \n",
"112 190.00 1 \n",
"113 109.90 1 \n",
"114 27.90 3 \n",
"\n",
" sellers.MEAN(order_items.freight_value) \\\n",
"orderitem_id \n",
"0 21.340 \n",
"1 7.390 \n",
"2 21.270 \n",
"3 13.970 \n",
"4 16.050 \n",
"... ... \n",
"110 10.960 \n",
"111 13.206 \n",
"112 19.410 \n",
"113 15.530 \n",
"114 16.190 \n",
"\n",
" sellers.MEAN(order_items.order_item_id) \\\n",
"orderitem_id \n",
"0 1.0 \n",
"1 1.0 \n",
"2 1.0 \n",
"3 1.0 \n",
"4 1.0 \n",
"... ... \n",
"110 1.0 \n",
"111 1.2 \n",
"112 1.0 \n",
"113 1.0 \n",
"114 1.0 \n",
"\n",
" sellers.MEAN(order_items.price) \n",
"orderitem_id \n",
"0 61.200000 \n",
"1 29.990000 \n",
"2 110.990000 \n",
"3 26.490000 \n",
"4 49.900000 \n",
"... ... \n",
"110 17.900000 \n",
"111 54.590000 \n",
"112 190.000000 \n",
"113 109.900000 \n",
"114 38.596667 \n",
"\n",
"[115 rows x 43 columns]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Primitives are named up front so they are easy to find and tune.\n",
"# NOTE: the stored feature list below contains only COUNT/MEAN aggregations;\n",
"# \"mode\" and \"any\" yielded no features for this entity set.\n",
"dfs_agg_primitives = [\"mean\", \"count\", \"mode\", \"any\"]\n",
"dfs_trans_primitives = [\"hour\", \"weekday\"]\n",
"\n",
"# Deep Feature Synthesis targeting the order_items dataframe.\n",
"feature_matrix, feature_defs = ft.dfs(\n",
"    entityset=es,\n",
"    target_dataframe_name=\"order_items\",\n",
"    agg_primitives=dfs_agg_primitives,\n",
"    trans_primitives=dfs_trans_primitives,\n",
"    max_depth=2,\n",
")\n",
"\n",
"feature_matrix"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Полученные признаки\n",
"\n",
"Список определений признаков (`feature_defs`): каждому определению соответствует одна колонка полученного dataframe'а"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<Feature: order_item_id>,\n",
" <Feature: price>,\n",
" <Feature: freight_value>,\n",
" <Feature: HOUR(shipping_limit_date)>,\n",
" <Feature: WEEKDAY(shipping_limit_date)>,\n",
" <Feature: orders.order_status>,\n",
" <Feature: products.product_category_name>,\n",
" <Feature: products.product_name_lenght>,\n",
" <Feature: products.product_description_lenght>,\n",
" <Feature: products.product_photos_qty>,\n",
" <Feature: products.product_weight_g>,\n",
" <Feature: products.product_length_cm>,\n",
" <Feature: products.product_height_cm>,\n",
" <Feature: products.product_width_cm>,\n",
" <Feature: sellers.seller_zip_code_prefix>,\n",
" <Feature: sellers.seller_city>,\n",
" <Feature: sellers.seller_state>,\n",
" <Feature: orders.COUNT(order_items)>,\n",
" <Feature: orders.MEAN(order_items.freight_value)>,\n",
" <Feature: orders.MEAN(order_items.order_item_id)>,\n",
" <Feature: orders.MEAN(order_items.price)>,\n",
" <Feature: orders.HOUR(order_approved_at)>,\n",
" <Feature: orders.HOUR(order_delivered_carrier_date)>,\n",
" <Feature: orders.HOUR(order_delivered_customer_date)>,\n",
" <Feature: orders.HOUR(order_estimated_delivery_date)>,\n",
" <Feature: orders.HOUR(order_purchase_timestamp)>,\n",
" <Feature: orders.WEEKDAY(order_approved_at)>,\n",
" <Feature: orders.WEEKDAY(order_delivered_carrier_date)>,\n",
" <Feature: orders.WEEKDAY(order_delivered_customer_date)>,\n",
" <Feature: orders.WEEKDAY(order_estimated_delivery_date)>,\n",
" <Feature: orders.WEEKDAY(order_purchase_timestamp)>,\n",
" <Feature: orders.customers.customer_unique_id>,\n",
" <Feature: orders.customers.customer_zip_code_prefix>,\n",
" <Feature: orders.customers.customer_city>,\n",
" <Feature: orders.customers.customer_state>,\n",
" <Feature: products.COUNT(order_items)>,\n",
" <Feature: products.MEAN(order_items.freight_value)>,\n",
" <Feature: products.MEAN(order_items.order_item_id)>,\n",
" <Feature: products.MEAN(order_items.price)>,\n",
" <Feature: sellers.COUNT(order_items)>,\n",
" <Feature: sellers.MEAN(order_items.freight_value)>,\n",
" <Feature: sellers.MEAN(order_items.order_item_id)>,\n",
" <Feature: sellers.MEAN(order_items.price)>]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feature_defs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Отсечение значений признаков"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Определение выбросов с помощью boxplot"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Axes: >"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"titanic.boxplot(column=\"Age\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Отсечение значений признака Возраст: значения больше 65 лет заменяются на 65"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>Age</th>\n",
" <th>AgeClip</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>Wheadon, Mr. Edward H</td>\n",
" <td>66.0</td>\n",
" <td>65.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>Goldschmidt, Mr. George B</td>\n",
" <td>71.0</td>\n",
" <td>65.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>117</th>\n",
" <td>Connors, Mr. Patrick</td>\n",
" <td>70.5</td>\n",
" <td>65.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>494</th>\n",
" <td>Artagaveytia, Mr. Ramon</td>\n",
" <td>71.0</td>\n",
" <td>65.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>631</th>\n",
" <td>Barkworth, Mr. Algernon Henry Wilson</td>\n",
" <td>80.0</td>\n",
" <td>65.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>673</th>\n",
" <td>Mitchell, Mr. Henry Michael</td>\n",
" <td>70.0</td>\n",
" <td>65.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>746</th>\n",
" <td>Crosby, Capt. Edward Gifford</td>\n",
" <td>70.0</td>\n",
" <td>65.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>852</th>\n",
" <td>Svensson, Mr. Johan</td>\n",
" <td>74.0</td>\n",
" <td>65.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name Age AgeClip\n",
"34 Wheadon, Mr. Edward H 66.0 65.0\n",
"97 Goldschmidt, Mr. George B 71.0 65.0\n",
"117 Connors, Mr. Patrick 70.5 65.0\n",
"494 Artagaveytia, Mr. Ramon 71.0 65.0\n",
"631 Barkworth, Mr. Algernon Henry Wilson 80.0 65.0\n",
"673 Mitchell, Mr. Henry Michael 70.0 65.0\n",
"746 Crosby, Capt. Edward Gifford 70.0 65.0\n",
"852 Svensson, Mr. Johan 74.0 65.0"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Work on a copy so the original titanic frame stays untouched.\n",
"titanic_norm = titanic.copy()\n",
"\n",
"# Cap Age to the [0, 65] range; missing ages remain NaN.\n",
"titanic_norm[\"AgeClip\"] = titanic_norm[\"Age\"].clip(lower=0, upper=65)\n",
"\n",
"# Show only the passengers whose age was actually capped.\n",
"titanic_norm.loc[titanic_norm[\"Age\"] > 65, [\"Name\", \"Age\", \"AgeClip\"]]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Винсоризация признака Возраст"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"56.0\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>Age</th>\n",
" <th>AgeWinsorize</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>Wheadon, Mr. Edward H</td>\n",
" <td>66.0</td>\n",
" <td>54.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>Goldschmidt, Mr. George B</td>\n",
" <td>71.0</td>\n",
" <td>54.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>117</th>\n",
" <td>Connors, Mr. Patrick</td>\n",
" <td>70.5</td>\n",
" <td>54.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>494</th>\n",
" <td>Artagaveytia, Mr. Ramon</td>\n",
" <td>71.0</td>\n",
" <td>54.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>631</th>\n",
" <td>Barkworth, Mr. Algernon Henry Wilson</td>\n",
" <td>80.0</td>\n",
" <td>54.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>673</th>\n",
" <td>Mitchell, Mr. Henry Michael</td>\n",
" <td>70.0</td>\n",
" <td>54.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>746</th>\n",
" <td>Crosby, Capt. Edward Gifford</td>\n",
" <td>70.0</td>\n",
" <td>54.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>852</th>\n",
" <td>Svensson, Mr. Johan</td>\n",
" <td>74.0</td>\n",
" <td>54.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name Age AgeWinsorize\n",
"34 Wheadon, Mr. Edward H 66.0 54.0\n",
"97 Goldschmidt, Mr. George B 71.0 54.0\n",
"117 Connors, Mr. Patrick 70.5 54.0\n",
"494 Artagaveytia, Mr. Ramon 71.0 54.0\n",
"631 Barkworth, Mr. Algernon Henry Wilson 80.0 54.0\n",
"673 Mitchell, Mr. Henry Michael 70.0 54.0\n",
"746 Crosby, Capt. Edward Gifford 70.0 54.0\n",
"852 Svensson, Mr. Johan 74.0 54.0"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from scipy.stats.mstats import winsorize\n",
"\n",
"# 95th percentile of the raw ages (quantile skips missing values).\n",
"print(titanic_norm[\"Age\"].quantile(q=0.95))\n",
"\n",
"# Missing ages are imputed with the mean before winsorizing.\n",
"# NOTE: the cap is derived from this imputed series, so it can differ from the\n",
"# percentile printed above (54.0 vs 56.0 in the stored output).\n",
"age_filled = titanic_norm[\"Age\"].fillna(titanic_norm[\"Age\"].mean())\n",
"\n",
"# limits=(0, 0.05): leave the lower tail as is, cap the upper 5% of values.\n",
"titanic_norm[\"AgeWinsorize\"] = winsorize(age_filled, limits=(0, 0.05), inplace=False)\n",
"\n",
"titanic_norm.loc[titanic_norm[\"Age\"] > 65, [\"Name\", \"Age\", \"AgeWinsorize\"]]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Нормализация значений"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>Age</th>\n",
" <th>AgeNorm</th>\n",
" <th>AgeClipNorm</th>\n",
" <th>AgeWinsorizeNorm</th>\n",
" <th>AgeWinsorizeNorm2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Braund, Mr. Owen Harris</td>\n",
" <td>22.0</td>\n",
" <td>0.271174</td>\n",
" <td>0.334159</td>\n",
" <td>0.402762</td>\n",
" <td>-0.194476</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
" <td>38.0</td>\n",
" <td>0.472229</td>\n",
" <td>0.581914</td>\n",
" <td>0.701381</td>\n",
" <td>0.402762</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Heikkinen, Miss. Laina</td>\n",
" <td>26.0</td>\n",
" <td>0.321438</td>\n",
" <td>0.396098</td>\n",
" <td>0.477417</td>\n",
" <td>-0.045166</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
" <td>35.0</td>\n",
" <td>0.434531</td>\n",
" <td>0.535460</td>\n",
" <td>0.645390</td>\n",
" <td>0.290780</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Allen, Mr. William Henry</td>\n",
" <td>35.0</td>\n",
" <td>0.434531</td>\n",
" <td>0.535460</td>\n",
" <td>0.645390</td>\n",
" <td>0.290780</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Moran, Mr. James</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.546456</td>\n",
" <td>0.092912</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>McCarthy, Mr. Timothy J</td>\n",
" <td>54.0</td>\n",
" <td>0.673285</td>\n",
" <td>0.829669</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Palsson, Master. Gosta Leonard</td>\n",
" <td>2.0</td>\n",
" <td>0.019854</td>\n",
" <td>0.024466</td>\n",
" <td>0.029489</td>\n",
" <td>-0.941023</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)</td>\n",
" <td>27.0</td>\n",
" <td>0.334004</td>\n",
" <td>0.411583</td>\n",
" <td>0.496081</td>\n",
" <td>-0.007839</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Nasser, Mrs. Nicholas (Adele Achem)</td>\n",
" <td>14.0</td>\n",
" <td>0.170646</td>\n",
" <td>0.210282</td>\n",
" <td>0.253453</td>\n",
" <td>-0.493094</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Sandstrom, Miss. Marguerite Rut</td>\n",
" <td>4.0</td>\n",
" <td>0.044986</td>\n",
" <td>0.055435</td>\n",
" <td>0.066816</td>\n",
" <td>-0.866368</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Bonnell, Miss. Elizabeth</td>\n",
" <td>58.0</td>\n",
" <td>0.723549</td>\n",
" <td>0.891607</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Saundercock, Mr. William Henry</td>\n",
" <td>20.0</td>\n",
" <td>0.246042</td>\n",
" <td>0.303190</td>\n",
" <td>0.365435</td>\n",
" <td>-0.269130</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Andersson, Mr. Anders Johan</td>\n",
" <td>39.0</td>\n",
" <td>0.484795</td>\n",
" <td>0.597399</td>\n",
" <td>0.720045</td>\n",
" <td>0.440090</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Vestrom, Miss. Hulda Amanda Adolfina</td>\n",
" <td>14.0</td>\n",
" <td>0.170646</td>\n",
" <td>0.210282</td>\n",
" <td>0.253453</td>\n",
" <td>-0.493094</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Hewlett, Mrs. (Mary D Kingcome)</td>\n",
" <td>55.0</td>\n",
" <td>0.685851</td>\n",
" <td>0.845153</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Rice, Master. Eugene</td>\n",
" <td>2.0</td>\n",
" <td>0.019854</td>\n",
" <td>0.024466</td>\n",
" <td>0.029489</td>\n",
" <td>-0.941023</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>Williams, Mr. Charles Eugene</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.546456</td>\n",
" <td>0.092912</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>Vander Planke, Mrs. Julius (Emelia Maria Vande...</td>\n",
" <td>31.0</td>\n",
" <td>0.384267</td>\n",
" <td>0.473521</td>\n",
" <td>0.570735</td>\n",
" <td>0.141471</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>Masselmani, Mrs. Fatima</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.546456</td>\n",
" <td>0.092912</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name Age AgeNorm \\\n",
"1 Braund, Mr. Owen Harris 22.0 0.271174 \n",
"2 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 0.472229 \n",
"3 Heikkinen, Miss. Laina 26.0 0.321438 \n",
"4 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 0.434531 \n",
"5 Allen, Mr. William Henry 35.0 0.434531 \n",
"6 Moran, Mr. James NaN NaN \n",
"7 McCarthy, Mr. Timothy J 54.0 0.673285 \n",
"8 Palsson, Master. Gosta Leonard 2.0 0.019854 \n",
"9 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 27.0 0.334004 \n",
"10 Nasser, Mrs. Nicholas (Adele Achem) 14.0 0.170646 \n",
"11 Sandstrom, Miss. Marguerite Rut 4.0 0.044986 \n",
"12 Bonnell, Miss. Elizabeth 58.0 0.723549 \n",
"13 Saundercock, Mr. William Henry 20.0 0.246042 \n",
"14 Andersson, Mr. Anders Johan 39.0 0.484795 \n",
"15 Vestrom, Miss. Hulda Amanda Adolfina 14.0 0.170646 \n",
"16 Hewlett, Mrs. (Mary D Kingcome) 55.0 0.685851 \n",
"17 Rice, Master. Eugene 2.0 0.019854 \n",
"18 Williams, Mr. Charles Eugene NaN NaN \n",
"19 Vander Planke, Mrs. Julius (Emelia Maria Vande... 31.0 0.384267 \n",
"20 Masselmani, Mrs. Fatima NaN NaN \n",
"\n",
" AgeClipNorm AgeWinsorizeNorm AgeWinsorizeNorm2 \n",
"1 0.334159 0.402762 -0.194476 \n",
"2 0.581914 0.701381 0.402762 \n",
"3 0.396098 0.477417 -0.045166 \n",
"4 0.535460 0.645390 0.290780 \n",
"5 0.535460 0.645390 0.290780 \n",
"6 NaN 0.546456 0.092912 \n",
"7 0.829669 1.000000 1.000000 \n",
"8 0.024466 0.029489 -0.941023 \n",
"9 0.411583 0.496081 -0.007839 \n",
"10 0.210282 0.253453 -0.493094 \n",
"11 0.055435 0.066816 -0.866368 \n",
"12 0.891607 1.000000 1.000000 \n",
"13 0.303190 0.365435 -0.269130 \n",
"14 0.597399 0.720045 0.440090 \n",
"15 0.210282 0.253453 -0.493094 \n",
"16 0.845153 1.000000 1.000000 \n",
"17 0.024466 0.029489 -0.941023 \n",
"18 NaN 0.546456 0.092912 \n",
"19 0.473521 0.570735 0.141471 \n",
"20 NaN 0.546456 0.092912 "
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn import preprocessing\n",
"\n",
"min_max_scaler = preprocessing.MinMaxScaler()\n",
"min_max_scaler_2 = preprocessing.MinMaxScaler(feature_range=(-1, 1))\n",
"\n",
"# (scaler, source column, normalized column); each scaler is re-fitted per row of the plan.\n",
"normalization_plan = [\n",
"    (min_max_scaler, \"Age\", \"AgeNorm\"),\n",
"    (min_max_scaler, \"AgeClip\", \"AgeClipNorm\"),\n",
"    (min_max_scaler, \"AgeWinsorize\", \"AgeWinsorizeNorm\"),\n",
"    (min_max_scaler_2, \"AgeWinsorize\", \"AgeWinsorizeNorm2\"),\n",
"]\n",
"\n",
"for scaler, source_col, target_col in normalization_plan:\n",
"    # The scaler is fed a 2-D (n_samples, 1) array; the result is flattened back to 1-D.\n",
"    column_2d = titanic_norm[source_col].to_numpy().reshape(-1, 1)\n",
"    titanic_norm[target_col] = scaler.fit_transform(column_2d).ravel()\n",
"\n",
"normalized_view = [\n",
"    \"Name\",\n",
"    \"Age\",\n",
"    \"AgeNorm\",\n",
"    \"AgeClipNorm\",\n",
"    \"AgeWinsorizeNorm\",\n",
"    \"AgeWinsorizeNorm2\",\n",
"]\n",
"titanic_norm[normalized_view].head(20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Стандартизация значений"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>Age</th>\n",
" <th>AgeStand</th>\n",
" <th>AgeClipStand</th>\n",
" <th>AgeWinsorizeStand</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Braund, Mr. Owen Harris</td>\n",
" <td>22.0</td>\n",
" <td>-0.530377</td>\n",
" <td>-0.532745</td>\n",
" <td>-0.606602</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
" <td>38.0</td>\n",
" <td>0.571831</td>\n",
" <td>0.585060</td>\n",
" <td>0.718863</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Heikkinen, Miss. Laina</td>\n",
" <td>26.0</td>\n",
" <td>-0.254825</td>\n",
" <td>-0.253294</td>\n",
" <td>-0.275236</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
" <td>35.0</td>\n",
" <td>0.365167</td>\n",
" <td>0.375472</td>\n",
" <td>0.470339</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Allen, Mr. William Henry</td>\n",
" <td>35.0</td>\n",
" <td>0.365167</td>\n",
" <td>0.375472</td>\n",
" <td>0.470339</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Moran, Mr. James</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.031205</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>McCarthy, Mr. Timothy J</td>\n",
" <td>54.0</td>\n",
" <td>1.674039</td>\n",
" <td>1.702866</td>\n",
" <td>2.044329</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Palsson, Master. Gosta Leonard</td>\n",
" <td>2.0</td>\n",
" <td>-1.908136</td>\n",
" <td>-1.930003</td>\n",
" <td>-2.263435</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)</td>\n",
" <td>27.0</td>\n",
" <td>-0.185937</td>\n",
" <td>-0.183431</td>\n",
" <td>-0.192394</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Nasser, Mrs. Nicholas (Adele Achem)</td>\n",
" <td>14.0</td>\n",
" <td>-1.081480</td>\n",
" <td>-1.091648</td>\n",
" <td>-1.269335</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Sandstrom, Miss. Marguerite Rut</td>\n",
" <td>4.0</td>\n",
" <td>-1.770360</td>\n",
" <td>-1.790277</td>\n",
" <td>-2.097751</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Bonnell, Miss. Elizabeth</td>\n",
" <td>58.0</td>\n",
" <td>1.949591</td>\n",
" <td>1.982317</td>\n",
" <td>2.044329</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Saundercock, Mr. William Henry</td>\n",
" <td>20.0</td>\n",
" <td>-0.668153</td>\n",
" <td>-0.672471</td>\n",
" <td>-0.772286</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Andersson, Mr. Anders Johan</td>\n",
" <td>39.0</td>\n",
" <td>0.640719</td>\n",
" <td>0.654923</td>\n",
" <td>0.801705</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Vestrom, Miss. Hulda Amanda Adolfina</td>\n",
" <td>14.0</td>\n",
" <td>-1.081480</td>\n",
" <td>-1.091648</td>\n",
" <td>-1.269335</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Hewlett, Mrs. (Mary D Kingcome)</td>\n",
" <td>55.0</td>\n",
" <td>1.742927</td>\n",
" <td>1.772729</td>\n",
" <td>2.044329</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Rice, Master. Eugene</td>\n",
" <td>2.0</td>\n",
" <td>-1.908136</td>\n",
" <td>-1.930003</td>\n",
" <td>-2.263435</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>Williams, Mr. Charles Eugene</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.031205</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>Vander Planke, Mrs. Julius (Emelia Maria Vande...</td>\n",
" <td>31.0</td>\n",
" <td>0.089615</td>\n",
" <td>0.096020</td>\n",
" <td>0.138972</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>Masselmani, Mrs. Fatima</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.031205</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name Age AgeStand \\\n",
"1 Braund, Mr. Owen Harris 22.0 -0.530377 \n",
"2 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 0.571831 \n",
"3 Heikkinen, Miss. Laina 26.0 -0.254825 \n",
"4 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 0.365167 \n",
"5 Allen, Mr. William Henry 35.0 0.365167 \n",
"6 Moran, Mr. James NaN NaN \n",
"7 McCarthy, Mr. Timothy J 54.0 1.674039 \n",
"8 Palsson, Master. Gosta Leonard 2.0 -1.908136 \n",
"9 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 27.0 -0.185937 \n",
"10 Nasser, Mrs. Nicholas (Adele Achem) 14.0 -1.081480 \n",
"11 Sandstrom, Miss. Marguerite Rut 4.0 -1.770360 \n",
"12 Bonnell, Miss. Elizabeth 58.0 1.949591 \n",
"13 Saundercock, Mr. William Henry 20.0 -0.668153 \n",
"14 Andersson, Mr. Anders Johan 39.0 0.640719 \n",
"15 Vestrom, Miss. Hulda Amanda Adolfina 14.0 -1.081480 \n",
"16 Hewlett, Mrs. (Mary D Kingcome) 55.0 1.742927 \n",
"17 Rice, Master. Eugene 2.0 -1.908136 \n",
"18 Williams, Mr. Charles Eugene NaN NaN \n",
"19 Vander Planke, Mrs. Julius (Emelia Maria Vande... 31.0 0.089615 \n",
"20 Masselmani, Mrs. Fatima NaN NaN \n",
"\n",
" AgeClipStand AgeWinsorizeStand \n",
"1 -0.532745 -0.606602 \n",
"2 0.585060 0.718863 \n",
"3 -0.253294 -0.275236 \n",
"4 0.375472 0.470339 \n",
"5 0.375472 0.470339 \n",
"6 NaN 0.031205 \n",
"7 1.702866 2.044329 \n",
"8 -1.930003 -2.263435 \n",
"9 -0.183431 -0.192394 \n",
"10 -1.091648 -1.269335 \n",
"11 -1.790277 -2.097751 \n",
"12 1.982317 2.044329 \n",
"13 -0.672471 -0.772286 \n",
"14 0.654923 0.801705 \n",
"15 -1.091648 -1.269335 \n",
"16 1.772729 2.044329 \n",
"17 -1.930003 -2.263435 \n",
"18 NaN 0.031205 \n",
"19 0.096020 0.138972 \n",
"20 NaN 0.031205 "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn import preprocessing\n",
"\n",
"standard_scaler = preprocessing.StandardScaler()\n",
"\n",
"# (source column, standardized column); the scaler is re-fitted for every pair.\n",
"standardization_plan = [\n",
"    (\"Age\", \"AgeStand\"),\n",
"    (\"AgeClip\", \"AgeClipStand\"),\n",
"    (\"AgeWinsorize\", \"AgeWinsorizeStand\"),\n",
"]\n",
"\n",
"for source_col, target_col in standardization_plan:\n",
"    # The scaler is fed a 2-D (n_samples, 1) array; the result is flattened back to 1-D.\n",
"    column_2d = titanic_norm[source_col].to_numpy().reshape(-1, 1)\n",
"    titanic_norm[target_col] = standard_scaler.fit_transform(column_2d).ravel()\n",
"\n",
"titanic_norm[[\"Name\", \"Age\", \"AgeStand\", \"AgeClipStand\", \"AgeWinsorizeStand\"]].head(20)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}