{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Загрузка набора данных Titanic" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
PassengerId
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
503Allen, Mr. William Henrymale35.0003734508.0500NaNS
....................................
88702Montvila, Rev. Juozasmale27.00021153613.0000NaNS
88811Graham, Miss. Margaret Edithfemale19.00011205330.0000B42S
88903Johnston, Miss. Catherine Helen \"Carrie\"femaleNaN12W./C. 660723.4500NaNS
89011Behr, Mr. Karl Howellmale26.00011136930.0000C148C
89103Dooley, Mr. Patrickmale32.0003703767.7500NaNQ
\n", "

891 rows × 11 columns

\n", "
" ], "text/plain": [ " Survived Pclass \\\n", "PassengerId \n", "1 0 3 \n", "2 1 1 \n", "3 1 3 \n", "4 1 1 \n", "5 0 3 \n", "... ... ... \n", "887 0 2 \n", "888 1 1 \n", "889 0 3 \n", "890 1 1 \n", "891 0 3 \n", "\n", " Name Sex Age \\\n", "PassengerId \n", "1 Braund, Mr. Owen Harris male 22.0 \n", "2 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 \n", "3 Heikkinen, Miss. Laina female 26.0 \n", "4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 \n", "5 Allen, Mr. William Henry male 35.0 \n", "... ... ... ... \n", "887 Montvila, Rev. Juozas male 27.0 \n", "888 Graham, Miss. Margaret Edith female 19.0 \n", "889 Johnston, Miss. Catherine Helen \"Carrie\" female NaN \n", "890 Behr, Mr. Karl Howell male 26.0 \n", "891 Dooley, Mr. Patrick male 32.0 \n", "\n", " SibSp Parch Ticket Fare Cabin Embarked \n", "PassengerId \n", "1 1 0 A/5 21171 7.2500 NaN S \n", "2 1 0 PC 17599 71.2833 C85 C \n", "3 0 0 STON/O2. 3101282 7.9250 NaN S \n", "4 1 0 113803 53.1000 C123 S \n", "5 0 0 373450 8.0500 NaN S \n", "... ... ... ... ... ... ... \n", "887 0 0 211536 13.0000 NaN S \n", "888 0 0 112053 30.0000 B42 S \n", "889 1 2 W./C. 6607 23.4500 NaN S \n", "890 0 0 111369 30.0000 C148 C \n", "891 0 0 370376 7.7500 NaN Q \n", "\n", "[891 rows x 11 columns]" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "titanic = pd.read_csv(\"data/titanic.csv\", index_col=\"PassengerId\")\n", "\n", "titanic" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Унитарное кодирование\n", "\n", "Преобразование категориального признака в несколько бинарных признаков" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Унитарное кодирование признаков Пол (Sex) и Порт посадки (Embarked)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Кодирование" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Embarked_QEmbarked_SEmbarked_nanSex_male
00.01.00.01.0
10.00.00.00.0
20.01.00.00.0
30.01.00.00.0
40.01.00.01.0
...............
8860.01.00.01.0
8870.01.00.00.0
8880.01.00.00.0
8890.00.00.01.0
8901.00.00.01.0
\n", "

891 rows × 4 columns

\n", "
" ], "text/plain": [ " Embarked_Q Embarked_S Embarked_nan Sex_male\n", "0 0.0 1.0 0.0 1.0\n", "1 0.0 0.0 0.0 0.0\n", "2 0.0 1.0 0.0 0.0\n", "3 0.0 1.0 0.0 0.0\n", "4 0.0 1.0 0.0 1.0\n", ".. ... ... ... ...\n", "886 0.0 1.0 0.0 1.0\n", "887 0.0 1.0 0.0 0.0\n", "888 0.0 1.0 0.0 0.0\n", "889 0.0 0.0 0.0 1.0\n", "890 1.0 0.0 0.0 1.0\n", "\n", "[891 rows x 4 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.preprocessing import OneHotEncoder\n", "import numpy as np\n", "\n", "# One-hot encode Embarked and Sex. drop=\"first\" omits the first category of each\n", "# feature; missing Embarked values are encoded as their own Embarked_nan column.\n", "encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n", "\n", "encoded_values = encoder.fit_transform(titanic[[\"Embarked\", \"Sex\"]])\n", "\n", "encoded_columns = encoder.get_feature_names_out([\"Embarked\", \"Sex\"])\n", "\n", "# Reuse the PassengerId index of `titanic`. With the default 0..n-1 index a later\n", "# pd.concat(..., axis=1) aligns rows by label, which shifts every encoded row by one\n", "# passenger and appends an extra all-NaN row.\n", "encoded_values_df = pd.DataFrame(\n", "    encoded_values, columns=encoded_columns, index=titanic.index\n", ")\n", "\n", "encoded_values_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Добавление признаков в исходный Dataframe" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedEmbarked_QEmbarked_SEmbarked_nanSex_male
10.03.0Braund, Mr. Owen Harrismale22.01.00.0A/5 211717.2500NaNS0.00.00.00.0
21.01.0Cumings, Mrs. John Bradley (Florence Briggs Th...female38.01.00.0PC 1759971.2833C85C0.01.00.00.0
31.03.0Heikkinen, Miss. Lainafemale26.00.00.0STON/O2. 31012827.9250NaNS0.01.00.00.0
41.01.0Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01.00.011380353.1000C123S0.01.00.01.0
50.03.0Allen, Mr. William Henrymale35.00.00.03734508.0500NaNS1.00.00.01.0
................................................
8881.01.0Graham, Miss. Margaret Edithfemale19.00.00.011205330.0000B42S0.01.00.00.0
8890.03.0Johnston, Miss. Catherine Helen \"Carrie\"femaleNaN1.02.0W./C. 660723.4500NaNS0.00.00.01.0
8901.01.0Behr, Mr. Karl Howellmale26.00.00.011136930.0000C148C1.00.00.01.0
8910.03.0Dooley, Mr. Patrickmale32.00.00.03703767.7500NaNQNaNNaNNaNNaN
0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0.01.00.01.0
\n", "

892 rows × 15 columns

\n", "
" ], "text/plain": [ " Survived Pclass Name \\\n", "1 0.0 3.0 Braund, Mr. Owen Harris \n", "2 1.0 1.0 Cumings, Mrs. John Bradley (Florence Briggs Th... \n", "3 1.0 3.0 Heikkinen, Miss. Laina \n", "4 1.0 1.0 Futrelle, Mrs. Jacques Heath (Lily May Peel) \n", "5 0.0 3.0 Allen, Mr. William Henry \n", ".. ... ... ... \n", "888 1.0 1.0 Graham, Miss. Margaret Edith \n", "889 0.0 3.0 Johnston, Miss. Catherine Helen \"Carrie\" \n", "890 1.0 1.0 Behr, Mr. Karl Howell \n", "891 0.0 3.0 Dooley, Mr. Patrick \n", "0 NaN NaN NaN \n", "\n", " Sex Age SibSp Parch Ticket Fare Cabin Embarked \\\n", "1 male 22.0 1.0 0.0 A/5 21171 7.2500 NaN S \n", "2 female 38.0 1.0 0.0 PC 17599 71.2833 C85 C \n", "3 female 26.0 0.0 0.0 STON/O2. 3101282 7.9250 NaN S \n", "4 female 35.0 1.0 0.0 113803 53.1000 C123 S \n", "5 male 35.0 0.0 0.0 373450 8.0500 NaN S \n", ".. ... ... ... ... ... ... ... ... \n", "888 female 19.0 0.0 0.0 112053 30.0000 B42 S \n", "889 female NaN 1.0 2.0 W./C. 6607 23.4500 NaN S \n", "890 male 26.0 0.0 0.0 111369 30.0000 C148 C \n", "891 male 32.0 0.0 0.0 370376 7.7500 NaN Q \n", "0 NaN NaN NaN NaN NaN NaN NaN NaN \n", "\n", " Embarked_Q Embarked_S Embarked_nan Sex_male \n", "1 0.0 0.0 0.0 0.0 \n", "2 0.0 1.0 0.0 0.0 \n", "3 0.0 1.0 0.0 0.0 \n", "4 0.0 1.0 0.0 1.0 \n", "5 1.0 0.0 0.0 1.0 \n", ".. ... ... ... ... 
\n", "888 0.0 1.0 0.0 0.0\n", "889 0.0 0.0 0.0 1.0\n", "890 1.0 0.0 0.0 1.0\n", "891 NaN NaN NaN NaN\n", "0 0.0 1.0 0.0 1.0\n", "\n", "[892 rows x 15 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pd.concat(axis=1) aligns rows by index label. Re-key the encoded rows to the\n", "# PassengerId index of `titanic` (the encoder keeps the row order) so that each\n", "# passenger receives its own encoded values and no extra NaN row is created.\n", "encoded_aligned = encoded_values_df.set_axis(titanic.index, axis=0)\n", "\n", "# Drop encoded columns left over from a previous run so that re-executing this\n", "# cell does not append duplicate columns.\n", "titanic = pd.concat(\n", "    [titanic.drop(columns=encoded_aligned.columns, errors=\"ignore\"), encoded_aligned],\n", "    axis=1,\n", ")\n", "\n", "titanic" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Дискретизация признаков" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Равномерное разделение данных на 3 группы" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "labels = [\"young\", \"middle-aged\", \"old\"]\n", "num_bins = 3" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([ 0.42 , 26.94666667, 53.47333333, 80. ]),\n", " array([319, 523, 50]))" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hist1, bins1 = np.histogram(titanic[\"Age\"].fillna(titanic[\"Age\"].median()), bins=num_bins)\n", "bins1, hist1" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeAge
122.0(0.42, 26.947]
238.0(26.947, 53.473]
326.0(0.42, 26.947]
435.0(26.947, 53.473]
535.0(26.947, 53.473]
6NaNNaN
754.0(53.473, 80.0]
82.0(0.42, 26.947]
927.0(26.947, 53.473]
1014.0(0.42, 26.947]
114.0(0.42, 26.947]
1258.0(53.473, 80.0]
1320.0(0.42, 26.947]
1439.0(26.947, 53.473]
1514.0(0.42, 26.947]
1655.0(53.473, 80.0]
172.0(0.42, 26.947]
18NaNNaN
1931.0(26.947, 53.473]
20NaNNaN
\n", "
" ], "text/plain": [ " Age Age\n", "1 22.0 (0.42, 26.947]\n", "2 38.0 (26.947, 53.473]\n", "3 26.0 (0.42, 26.947]\n", "4 35.0 (26.947, 53.473]\n", "5 35.0 (26.947, 53.473]\n", "6 NaN NaN\n", "7 54.0 (53.473, 80.0]\n", "8 2.0 (0.42, 26.947]\n", "9 27.0 (26.947, 53.473]\n", "10 14.0 (0.42, 26.947]\n", "11 4.0 (0.42, 26.947]\n", "12 58.0 (53.473, 80.0]\n", "13 20.0 (0.42, 26.947]\n", "14 39.0 (26.947, 53.473]\n", "15 14.0 (0.42, 26.947]\n", "16 55.0 (53.473, 80.0]\n", "17 2.0 (0.42, 26.947]\n", "18 NaN NaN\n", "19 31.0 (26.947, 53.473]\n", "20 NaN NaN" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# bins1[0] is the minimum age itself and pd.cut intervals are right-closed, so\n", "# without include_lowest=True the youngest passenger falls outside the first bin\n", "# and is returned as NaN.\n", "pd.concat(\n", "    [titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins1), include_lowest=True)], axis=1\n", ").head(20)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeAge
122.0young
238.0middle-aged
326.0young
435.0middle-aged
535.0middle-aged
6NaNNaN
754.0old
82.0young
927.0middle-aged
1014.0young
114.0young
1258.0old
1320.0young
1439.0middle-aged
1514.0young
1655.0old
172.0young
18NaNNaN
1931.0middle-aged
20NaNNaN
\n", "
" ], "text/plain": [ " Age Age\n", "1 22.0 young\n", "2 38.0 middle-aged\n", "3 26.0 young\n", "4 35.0 middle-aged\n", "5 35.0 middle-aged\n", "6 NaN NaN\n", "7 54.0 old\n", "8 2.0 young\n", "9 27.0 middle-aged\n", "10 14.0 young\n", "11 4.0 young\n", "12 58.0 old\n", "13 20.0 young\n", "14 39.0 middle-aged\n", "15 14.0 young\n", "16 55.0 old\n", "17 2.0 young\n", "18 NaN NaN\n", "19 31.0 middle-aged\n", "20 NaN NaN" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# bins1[0] is the minimum age itself and pd.cut intervals are right-closed, so\n", "# include_lowest=True is required to keep the youngest passenger in the first\n", "# labelled group instead of NaN.\n", "pd.concat(\n", "    [\n", "        titanic[\"Age\"],\n", "        pd.cut(titanic[\"Age\"], list(bins1), labels=labels, include_lowest=True),\n", "    ],\n", "    axis=1,\n", ").head(20)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Равномерное разделение данных на 3 группы c установкой собственной границы диапазона значений (от 0 до 100)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([ 0. , 33.33333333, 66.66666667, 100. ]),\n", " array([641, 244, 7]))" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bins2 = np.linspace(0, 100, 4)\n", "tmp_bins2 = np.digitize(titanic[\"Age\"].fillna(titanic[\"Age\"].median()), bins2)\n", "hist2 = np.bincount(tmp_bins2 - 1)\n", "bins2, hist2" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeAge
122.0(0.0, 33.333]
238.0(33.333, 66.667]
326.0(0.0, 33.333]
435.0(33.333, 66.667]
535.0(33.333, 66.667]
6NaNNaN
754.0(33.333, 66.667]
82.0(0.0, 33.333]
927.0(0.0, 33.333]
1014.0(0.0, 33.333]
114.0(0.0, 33.333]
1258.0(33.333, 66.667]
1320.0(0.0, 33.333]
1439.0(33.333, 66.667]
1514.0(0.0, 33.333]
1655.0(33.333, 66.667]
172.0(0.0, 33.333]
18NaNNaN
1931.0(0.0, 33.333]
20NaNNaN
\n", "
" ], "text/plain": [ " Age Age\n", "1 22.0 (0.0, 33.333]\n", "2 38.0 (33.333, 66.667]\n", "3 26.0 (0.0, 33.333]\n", "4 35.0 (33.333, 66.667]\n", "5 35.0 (33.333, 66.667]\n", "6 NaN NaN\n", "7 54.0 (33.333, 66.667]\n", "8 2.0 (0.0, 33.333]\n", "9 27.0 (0.0, 33.333]\n", "10 14.0 (0.0, 33.333]\n", "11 4.0 (0.0, 33.333]\n", "12 58.0 (33.333, 66.667]\n", "13 20.0 (0.0, 33.333]\n", "14 39.0 (33.333, 66.667]\n", "15 14.0 (0.0, 33.333]\n", "16 55.0 (33.333, 66.667]\n", "17 2.0 (0.0, 33.333]\n", "18 NaN NaN\n", "19 31.0 (0.0, 33.333]\n", "20 NaN NaN" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Show each age next to the bins2 interval it falls into (first 20 passengers).\n", "pd.concat(\n", "    objs=[titanic[\"Age\"], pd.cut(x=titanic[\"Age\"], bins=list(bins2))],\n", "    axis=\"columns\",\n", ").head(n=20)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeAge
122.0young
238.0middle-aged
326.0young
435.0middle-aged
535.0middle-aged
6NaNNaN
754.0middle-aged
82.0young
927.0young
1014.0young
114.0young
1258.0middle-aged
1320.0young
1439.0middle-aged
1514.0young
1655.0middle-aged
172.0young
18NaNNaN
1931.0young
20NaNNaN
\n", "
" ], "text/plain": [ " Age Age\n", "1 22.0 young\n", "2 38.0 middle-aged\n", "3 26.0 young\n", "4 35.0 middle-aged\n", "5 35.0 middle-aged\n", "6 NaN NaN\n", "7 54.0 middle-aged\n", "8 2.0 young\n", "9 27.0 young\n", "10 14.0 young\n", "11 4.0 young\n", "12 58.0 middle-aged\n", "13 20.0 young\n", "14 39.0 middle-aged\n", "15 14.0 young\n", "16 55.0 middle-aged\n", "17 2.0 young\n", "18 NaN NaN\n", "19 31.0 young\n", "20 NaN NaN" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins2), labels=labels)], axis=1).head(20)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Разделение данных на 3 группы с установкой собственных интервалов: [0, 40), [40, 60), [60, 100] (интервалы разной ширины, поэтому разделение не является равномерным)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([ 0, 40, 60, 100]), array([729, 137, 26]))" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hist3, bins3 = np.histogram(\n", " titanic[\"Age\"].fillna(titanic[\"Age\"].median()), bins=[0, 40, 60, 100]\n", ")\n", "bins3, hist3" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeAge
122.0(0.0, 40.0]
238.0(0.0, 40.0]
326.0(0.0, 40.0]
435.0(0.0, 40.0]
535.0(0.0, 40.0]
6NaNNaN
754.0(40.0, 60.0]
82.0(0.0, 40.0]
927.0(0.0, 40.0]
1014.0(0.0, 40.0]
114.0(0.0, 40.0]
1258.0(40.0, 60.0]
1320.0(0.0, 40.0]
1439.0(0.0, 40.0]
1514.0(0.0, 40.0]
1655.0(40.0, 60.0]
172.0(0.0, 40.0]
18NaNNaN
1931.0(0.0, 40.0]
20NaNNaN
\n", "
" ], "text/plain": [ " Age Age\n", "1 22.0 (0.0, 40.0]\n", "2 38.0 (0.0, 40.0]\n", "3 26.0 (0.0, 40.0]\n", "4 35.0 (0.0, 40.0]\n", "5 35.0 (0.0, 40.0]\n", "6 NaN NaN\n", "7 54.0 (40.0, 60.0]\n", "8 2.0 (0.0, 40.0]\n", "9 27.0 (0.0, 40.0]\n", "10 14.0 (0.0, 40.0]\n", "11 4.0 (0.0, 40.0]\n", "12 58.0 (40.0, 60.0]\n", "13 20.0 (0.0, 40.0]\n", "14 39.0 (0.0, 40.0]\n", "15 14.0 (0.0, 40.0]\n", "16 55.0 (40.0, 60.0]\n", "17 2.0 (0.0, 40.0]\n", "18 NaN NaN\n", "19 31.0 (0.0, 40.0]\n", "20 NaN NaN" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# np.histogram (used for hist3) treats bins as left-closed: [0, 40), [40, 60), ...\n", "# pd.cut is right-closed by default, which would put ages of exactly 40 or 60 into\n", "# a different group than the one counted in hist3; right=False keeps both in sync.\n", "pd.concat(\n", "    [titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins3), right=False)], axis=1\n", ").head(20)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeAge
122.0young
238.0young
326.0young
435.0young
535.0young
6NaNNaN
754.0middle-aged
82.0young
927.0young
1014.0young
114.0young
1258.0middle-aged
1320.0young
1439.0young
1514.0young
1655.0middle-aged
172.0young
18NaNNaN
1931.0young
20NaNNaN
\n", "
" ], "text/plain": [ " Age Age\n", "1 22.0 young\n", "2 38.0 young\n", "3 26.0 young\n", "4 35.0 young\n", "5 35.0 young\n", "6 NaN NaN\n", "7 54.0 middle-aged\n", "8 2.0 young\n", "9 27.0 young\n", "10 14.0 young\n", "11 4.0 young\n", "12 58.0 middle-aged\n", "13 20.0 young\n", "14 39.0 young\n", "15 14.0 young\n", "16 55.0 middle-aged\n", "17 2.0 young\n", "18 NaN NaN\n", "19 31.0 young\n", "20 NaN NaN" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# np.histogram (used for hist3) treats bins as left-closed: [0, 40), [40, 60), ...\n", "# pd.cut is right-closed by default, which would label ages of exactly 40 or 60\n", "# differently from how hist3 counts them; right=False keeps both in sync.\n", "pd.concat(\n", "    [\n", "        titanic[\"Age\"],\n", "        pd.cut(titanic[\"Age\"], list(bins3), labels=labels, right=False),\n", "    ],\n", "    axis=1,\n", ").head(20)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Квантильное разделение данных на 3 группы" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeAge
122.00.0
238.02.0
326.01.0
435.02.0
535.02.0
6NaNNaN
754.02.0
82.00.0
927.01.0
1014.00.0
114.00.0
1258.02.0
1320.00.0
1439.02.0
1514.00.0
1655.02.0
172.00.0
18NaNNaN
1931.01.0
20NaNNaN
\n", "
" ], "text/plain": [ " Age Age\n", "1 22.0 0.0\n", "2 38.0 2.0\n", "3 26.0 1.0\n", "4 35.0 2.0\n", "5 35.0 2.0\n", "6 NaN NaN\n", "7 54.0 2.0\n", "8 2.0 0.0\n", "9 27.0 1.0\n", "10 14.0 0.0\n", "11 4.0 0.0\n", "12 58.0 2.0\n", "13 20.0 0.0\n", "14 39.0 2.0\n", "15 14.0 0.0\n", "16 55.0 2.0\n", "17 2.0 0.0\n", "18 NaN NaN\n", "19 31.0 1.0\n", "20 NaN NaN" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Show each age next to the integer code of its quantile group (3 groups, first 20 rows).\n", "pd.concat(\n", "    objs=[titanic[\"Age\"], pd.qcut(x=titanic[\"Age\"], q=3, labels=False)],\n", "    axis=\"columns\",\n", ").head(n=20)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeAge
122.0young
238.0old
326.0middle-aged
435.0old
535.0old
6NaNNaN
754.0old
82.0young
927.0middle-aged
1014.0young
114.0young
1258.0old
1320.0young
1439.0old
1514.0young
1655.0old
172.0young
18NaNNaN
1931.0middle-aged
20NaNNaN
\n", "
" ], "text/plain": [ " Age Age\n", "1 22.0 young\n", "2 38.0 old\n", "3 26.0 middle-aged\n", "4 35.0 old\n", "5 35.0 old\n", "6 NaN NaN\n", "7 54.0 old\n", "8 2.0 young\n", "9 27.0 middle-aged\n", "10 14.0 young\n", "11 4.0 young\n", "12 58.0 old\n", "13 20.0 young\n", "14 39.0 old\n", "15 14.0 young\n", "16 55.0 old\n", "17 2.0 young\n", "18 NaN NaN\n", "19 31.0 middle-aged\n", "20 NaN NaN" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Show each age next to the label of its quantile group (3 groups, first 20 rows).\n", "pd.concat(\n", "    objs=[titanic[\"Age\"], pd.qcut(x=titanic[\"Age\"], q=3, labels=labels)],\n", "    axis=\"columns\",\n", ").head(n=20)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Пример конструирования признаков на основе существующих\n", "\n", "Title - обращение к пассажиру (Mr, Mrs, Miss)\n", "\n", "Is_married - замужняя ли женщина\n", "\n", "Cabin_type - палуба (тип каюты)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedTitleIs_marriedCabin_type
21.01.0Cumings, Mrs. John Bradley (Florence Briggs Th...female38.01.00.0PC 1759971.2833C85CMrs1C
41.01.0Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01.00.011380353.1000C123SMrs1C
70.01.0McCarthy, Mr. Timothy Jmale54.00.00.01746351.8625E46SMr0E
111.03.0Sandstrom, Miss. Marguerite Rutfemale4.01.01.0PP 954916.7000G6SMiss0G
121.01.0Bonnell, Miss. Elizabethfemale58.00.00.011378326.5500C103SMiss0C
.............................................
8721.01.0Beckwith, Mrs. Richard Leonard (Sallie Monypeny)female47.01.01.01175152.5542D35SMrs1D
8730.01.0Carlsson, Mr. Frans Olofmale33.00.00.06955.0000B51 B53 B55SMr0B
8801.01.0Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)female56.00.01.01176783.1583C50CMrs1C
8881.01.0Graham, Miss. Margaret Edithfemale19.00.00.011205330.0000B42SMiss0B
8901.01.0Behr, Mr. Karl Howellmale26.00.00.011136930.0000C148CMr0C
\n", "

183 rows × 14 columns

\n", "
" ], "text/plain": [ " Survived Pclass Name \\\n", "2 1.0 1.0 Cumings, Mrs. John Bradley (Florence Briggs Th... \n", "4 1.0 1.0 Futrelle, Mrs. Jacques Heath (Lily May Peel) \n", "7 0.0 1.0 McCarthy, Mr. Timothy J \n", "11 1.0 3.0 Sandstrom, Miss. Marguerite Rut \n", "12 1.0 1.0 Bonnell, Miss. Elizabeth \n", ".. ... ... ... \n", "872 1.0 1.0 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) \n", "873 0.0 1.0 Carlsson, Mr. Frans Olof \n", "880 1.0 1.0 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) \n", "888 1.0 1.0 Graham, Miss. Margaret Edith \n", "890 1.0 1.0 Behr, Mr. Karl Howell \n", "\n", " Sex Age SibSp Parch Ticket Fare Cabin Embarked \\\n", "2 female 38.0 1.0 0.0 PC 17599 71.2833 C85 C \n", "4 female 35.0 1.0 0.0 113803 53.1000 C123 S \n", "7 male 54.0 0.0 0.0 17463 51.8625 E46 S \n", "11 female 4.0 1.0 1.0 PP 9549 16.7000 G6 S \n", "12 female 58.0 0.0 0.0 113783 26.5500 C103 S \n", ".. ... ... ... ... ... ... ... ... \n", "872 female 47.0 1.0 1.0 11751 52.5542 D35 S \n", "873 male 33.0 0.0 0.0 695 5.0000 B51 B53 B55 S \n", "880 female 56.0 0.0 1.0 11767 83.1583 C50 C \n", "888 female 19.0 0.0 0.0 112053 30.0000 B42 S \n", "890 male 26.0 0.0 0.0 111369 30.0000 C148 C \n", "\n", " Title Is_married Cabin_type \n", "2 Mrs 1 C \n", "4 Mrs 1 C \n", "7 Mr 0 E \n", "11 Miss 0 G \n", "12 Miss 0 C \n", ".. ... ... ... 
\n", "872 Mrs 1 D \n", "873 Mr 0 B \n", "880 Mrs 1 C \n", "888 Miss 0 B \n", "890 Mr 0 C \n", "\n", "[183 rows x 14 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titanic_cl = titanic.drop(\n", " [\"Embarked_Q\", \"Embarked_S\", \"Embarked_nan\", \"Sex_male\"], axis=1, errors=\"ignore\"\n", ")\n", "titanic_cl = titanic_cl.dropna()\n", "\n", "titanic_cl[\"Title\"] = [\n", " i.split(\",\")[1].split(\".\")[0].strip() for i in titanic_cl[\"Name\"]\n", "]\n", "\n", "titanic_cl[\"Is_married\"] = [1 if i == \"Mrs\" else 0 for i in titanic_cl[\"Title\"]]\n", "\n", "titanic_cl[\"Cabin_type\"] = [i[0] for i in titanic_cl[\"Cabin\"]]\n", "\n", "titanic_cl" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Пример использования библиотеки Featuretools для автоматического конструирования (синтеза) признаков\n", "\n", "https://featuretools.alteryx.com/en/stable/getting_started/using_entitysets.html" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Загрузка данных\n", "\n", "За основу был взят набор данных \"Ecommerce Orders Data Set\" из Kaggle\n", "\n", "Используется только 100 первых заказов и связанные с ними объекты\n", "\n", "https://www.kaggle.com/datasets/sangamsharmait/ecommerce-orders-data-analysis" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "import featuretools as ft\n", "from woodwork.logical_types import Categorical, Datetime\n", "\n", "customers = pd.read_csv(\"data/orders/customers.csv\")\n", "sellers = pd.read_csv(\"data/orders/sellers.csv\")\n", "products = pd.read_csv(\"data/orders/products.csv\")\n", "orders = pd.read_csv(\"data/orders/orders.csv\")\n", "orders.fillna({\"order_delivered_carrier_date\": pd.to_datetime(\n", " \"1900-01-01 00:00:00\"\n", ")}, inplace=True)\n", "orders.fillna(\n", " {\"order_delivered_customer_date\": pd.to_datetime(\"1900-01-01 00:00:00\")},\n", " inplace=True,\n", ")\n", "order_items = 
pd.read_csv(\"data/orders/order_items.csv\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Создание сущностей в featuretools\n", "\n", "Добавление dataframe'ов с данными в EntitySet с указанием параметров: название сущности (таблицы), первичный ключ, категориальные атрибуты (в том числе даты)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n" ] }, { "data": { "text/plain": [ "Entityset: orders\n", " DataFrames:\n", " customers [Rows: 100, Columns: 5]\n", " sellers [Rows: 87, Columns: 4]\n", " products [Rows: 100, Columns: 9]\n", " orders [Rows: 100, Columns: 8]\n", " order_items [Rows: 115, Columns: 8]\n", " Relationships:\n", " No relationships" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "es = ft.EntitySet(id=\"orders\")\n", "\n", "es = es.add_dataframe(\n", " dataframe_name=\"customers\",\n", " dataframe=customers,\n", " index=\"customer_id\",\n", " logical_types={\n", " \"customer_unique_id\": Categorical,\n", " \"customer_zip_code_prefix\": Categorical,\n", " \"customer_city\": Categorical,\n", " \"customer_state\": Categorical,\n", " },\n", ")\n", "es = es.add_dataframe(\n", " dataframe_name=\"sellers\",\n", " dataframe=sellers,\n", " index=\"seller_id\",\n", " logical_types={\n", " \"seller_zip_code_prefix\": Categorical,\n", " \"seller_city\": Categorical,\n", " \"seller_state\": Categorical,\n", " },\n", ")\n", "es = es.add_dataframe(\n", " dataframe_name=\"products\",\n", " dataframe=products,\n", " index=\"product_id\",\n", " logical_types={\n", " \"product_category_name\": Categorical,\n", " \"product_name_lenght\": Categorical,\n", " \"product_description_lenght\": Categorical,\n", " \"product_photos_qty\": Categorical,\n", " },\n", ")\n", "es = es.add_dataframe(\n", " dataframe_name=\"orders\",\n", " dataframe=orders,\n", " index=\"order_id\",\n", " logical_types={\n", " \"order_status\": Categorical,\n", " \"order_purchase_timestamp\": Datetime,\n", " \"order_approved_at\": Datetime,\n", " \"order_delivered_carrier_date\": Datetime,\n", " \"order_delivered_customer_date\": Datetime,\n", " \"order_estimated_delivery_date\": Datetime,\n", " },\n", ")\n", "es = es.add_dataframe(\n", " dataframe_name=\"order_items\",\n", " 
dataframe=order_items,\n", " index=\"orderitem_id\",\n", " make_index=True,\n", " logical_types={\"shipping_limit_date\": Datetime},\n", ")\n", "\n", "es" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Настройка связей между сущностями featuretools\n", "\n", "Настройка связей между таблицами на уровне ключей\n", "\n", "Связь указывается от родителя к потомкам (таблица-родитель, первичный ключ, таблица-потомок, внешний ключ)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Entityset: orders\n", " DataFrames:\n", " customers [Rows: 100, Columns: 5]\n", " sellers [Rows: 87, Columns: 4]\n", " products [Rows: 100, Columns: 9]\n", " orders [Rows: 100, Columns: 8]\n", " order_items [Rows: 115, Columns: 8]\n", " Relationships:\n", " orders.customer_id -> customers.customer_id\n", " order_items.order_id -> orders.order_id\n", " order_items.product_id -> products.product_id\n", " order_items.seller_id -> sellers.seller_id" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "es = es.add_relationship(\"customers\", \"customer_id\", \"orders\", \"customer_id\")\n", "es = es.add_relationship(\"orders\", \"order_id\", \"order_items\", \"order_id\")\n", "es = es.add_relationship(\"products\", \"product_id\", \"order_items\", \"product_id\")\n", "es = es.add_relationship(\"sellers\", \"seller_id\", \"order_items\", \"seller_id\")\n", "\n", "es" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Автоматическое конструирование признаков с помощью featuretools\n", "\n", "Библиотека применяет различные функции агрегации и трансформации к атрибутам таблицы order_items с учетом отношений\n", "\n", "Результат помещается в Dataframe feature_matrix" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [
"c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\featuretools\\synthesis\\dfs.py:321: UnusedPrimitiveWarning: Some specified primitives were not used during DFS:\n", " agg_primitives: ['any', 'mode']\n", "This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.\n", " warnings.warn(warning_msg, UnusedPrimitiveWarning)\n", "c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n", " ).agg(to_agg)\n", "c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n", " ).agg(to_agg)\n", "c:\\Users\\user\\Projects\\python\\ckmai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n", " ).agg(to_agg)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
order_item_idpricefreight_valueHOUR(shipping_limit_date)WEEKDAY(shipping_limit_date)orders.order_statusproducts.product_category_nameproducts.product_name_lenghtproducts.product_description_lenghtproducts.product_photos_qty...orders.customers.customer_cityorders.customers.customer_stateproducts.COUNT(order_items)products.MEAN(order_items.freight_value)products.MEAN(order_items.order_item_id)products.MEAN(order_items.price)sellers.COUNT(order_items)sellers.MEAN(order_items.freight_value)sellers.MEAN(order_items.order_item_id)sellers.MEAN(order_items.price)
orderitem_id
0138.5024.84204deliveredcama_mesa_banho53.0223.01.0...santa luziaPB124.841.038.50221.3401.061.200000
1129.997.3980deliveredtelefonia59.0675.05.0...sao pauloSP17.391.029.9917.3901.029.990000
21110.9921.27211deliveredcama_mesa_banho52.0413.01.0...gravataiRS121.271.0110.99121.2701.0110.990000
3127.9915.10231deliveredtelefonia60.0818.06.0...imbitubaSC115.101.027.99213.9701.026.490000
4149.9016.05132invoicedNaNNaNNaNNaN...santa rosaRS116.051.049.90116.0501.049.900000
..................................................................
110117.9010.9681deliveredcama_mesa_banho55.0122.01.0...jundiaiSP110.961.017.90110.9601.017.900000
111179.998.9194deliveredbeleza_saude59.0492.03.0...sao pauloSP18.911.079.99513.2061.254.590000
1121190.0019.41133deliveredclimatizacao60.03270.04.0...pauliniaSP119.411.0190.00119.4101.0190.000000
1131109.9015.5322deliveredcool_stuff46.0595.02.0...rio de janeiroRJ115.531.0109.90115.5301.0109.900000
114127.9018.30142deliveredalimentos59.0982.01.0...joinvilleSC216.701.027.90316.1901.038.596667
\n", "

115 rows × 43 columns

\n", "
" ], "text/plain": [ " order_item_id price freight_value HOUR(shipping_limit_date) \\\n", "orderitem_id \n", "0 1 38.50 24.84 20 \n", "1 1 29.99 7.39 8 \n", "2 1 110.99 21.27 21 \n", "3 1 27.99 15.10 23 \n", "4 1 49.90 16.05 13 \n", "... ... ... ... ... \n", "110 1 17.90 10.96 8 \n", "111 1 79.99 8.91 9 \n", "112 1 190.00 19.41 13 \n", "113 1 109.90 15.53 2 \n", "114 1 27.90 18.30 14 \n", "\n", " WEEKDAY(shipping_limit_date) orders.order_status \\\n", "orderitem_id \n", "0 4 delivered \n", "1 0 delivered \n", "2 1 delivered \n", "3 1 delivered \n", "4 2 invoiced \n", "... ... ... \n", "110 1 delivered \n", "111 4 delivered \n", "112 3 delivered \n", "113 2 delivered \n", "114 2 delivered \n", "\n", " products.product_category_name products.product_name_lenght \\\n", "orderitem_id \n", "0 cama_mesa_banho 53.0 \n", "1 telefonia 59.0 \n", "2 cama_mesa_banho 52.0 \n", "3 telefonia 60.0 \n", "4 NaN NaN \n", "... ... ... \n", "110 cama_mesa_banho 55.0 \n", "111 beleza_saude 59.0 \n", "112 climatizacao 60.0 \n", "113 cool_stuff 46.0 \n", "114 alimentos 59.0 \n", "\n", " products.product_description_lenght products.product_photos_qty \\\n", "orderitem_id \n", "0 223.0 1.0 \n", "1 675.0 5.0 \n", "2 413.0 1.0 \n", "3 818.0 6.0 \n", "4 NaN NaN \n", "... ... ... \n", "110 122.0 1.0 \n", "111 492.0 3.0 \n", "112 3270.0 4.0 \n", "113 595.0 2.0 \n", "114 982.0 1.0 \n", "\n", " ... orders.customers.customer_city \\\n", "orderitem_id ... \n", "0 ... santa luzia \n", "1 ... sao paulo \n", "2 ... gravatai \n", "3 ... imbituba \n", "4 ... santa rosa \n", "... ... ... \n", "110 ... jundiai \n", "111 ... sao paulo \n", "112 ... paulinia \n", "113 ... rio de janeiro \n", "114 ... joinville \n", "\n", " orders.customers.customer_state products.COUNT(order_items) \\\n", "orderitem_id \n", "0 PB 1 \n", "1 SP 1 \n", "2 RS 1 \n", "3 SC 1 \n", "4 RS 1 \n", "... ... ... 
\n", "110 SP 1 \n", "111 SP 1 \n", "112 SP 1 \n", "113 RJ 1 \n", "114 SC 2 \n", "\n", " products.MEAN(order_items.freight_value) \\\n", "orderitem_id \n", "0 24.84 \n", "1 7.39 \n", "2 21.27 \n", "3 15.10 \n", "4 16.05 \n", "... ... \n", "110 10.96 \n", "111 8.91 \n", "112 19.41 \n", "113 15.53 \n", "114 16.70 \n", "\n", " products.MEAN(order_items.order_item_id) \\\n", "orderitem_id \n", "0 1.0 \n", "1 1.0 \n", "2 1.0 \n", "3 1.0 \n", "4 1.0 \n", "... ... \n", "110 1.0 \n", "111 1.0 \n", "112 1.0 \n", "113 1.0 \n", "114 1.0 \n", "\n", " products.MEAN(order_items.price) sellers.COUNT(order_items) \\\n", "orderitem_id \n", "0 38.50 2 \n", "1 29.99 1 \n", "2 110.99 1 \n", "3 27.99 2 \n", "4 49.90 1 \n", "... ... ... \n", "110 17.90 1 \n", "111 79.99 5 \n", "112 190.00 1 \n", "113 109.90 1 \n", "114 27.90 3 \n", "\n", " sellers.MEAN(order_items.freight_value) \\\n", "orderitem_id \n", "0 21.340 \n", "1 7.390 \n", "2 21.270 \n", "3 13.970 \n", "4 16.050 \n", "... ... \n", "110 10.960 \n", "111 13.206 \n", "112 19.410 \n", "113 15.530 \n", "114 16.190 \n", "\n", " sellers.MEAN(order_items.order_item_id) \\\n", "orderitem_id \n", "0 1.0 \n", "1 1.0 \n", "2 1.0 \n", "3 1.0 \n", "4 1.0 \n", "... ... \n", "110 1.0 \n", "111 1.2 \n", "112 1.0 \n", "113 1.0 \n", "114 1.0 \n", "\n", " sellers.MEAN(order_items.price) \n", "orderitem_id \n", "0 61.200000 \n", "1 29.990000 \n", "2 110.990000 \n", "3 26.490000 \n", "4 49.900000 \n", "... ... 
\n", "110 17.900000 \n", "111 54.590000 \n", "112 190.000000 \n", "113 109.900000 \n", "114 38.596667 \n", "\n", "[115 rows x 43 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_matrix, feature_defs = ft.dfs(\n", " entityset=es,\n", " target_dataframe_name=\"order_items\",\n", " agg_primitives=[\"mean\", \"count\", \"mode\", \"any\"],\n", " trans_primitives=[\"hour\", \"weekday\"],\n", " max_depth=2,\n", ")\n", "\n", "feature_matrix" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Полученные признаки\n", "\n", "Список колонок полученного dataframe'а" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_defs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Отсечение значений признаков" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Определение выбросов с помощью boxplot" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "titanic.boxplot(column=\"Age\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Отсечение данных для признака Возраст, значение которых больше 65 лет" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameAgeAgeClip
34Wheadon, Mr. Edward H66.065.0
97Goldschmidt, Mr. George B71.065.0
117Connors, Mr. Patrick70.565.0
494Artagaveytia, Mr. Ramon71.065.0
631Barkworth, Mr. Algernon Henry Wilson80.065.0
673Mitchell, Mr. Henry Michael70.065.0
746Crosby, Capt. Edward Gifford70.065.0
852Svensson, Mr. Johan74.065.0
\n", "
" ], "text/plain": [ " Name Age AgeClip\n", "34 Wheadon, Mr. Edward H 66.0 65.0\n", "97 Goldschmidt, Mr. George B 71.0 65.0\n", "117 Connors, Mr. Patrick 70.5 65.0\n", "494 Artagaveytia, Mr. Ramon 71.0 65.0\n", "631 Barkworth, Mr. Algernon Henry Wilson 80.0 65.0\n", "673 Mitchell, Mr. Henry Michael 70.0 65.0\n", "746 Crosby, Capt. Edward Gifford 70.0 65.0\n", "852 Svensson, Mr. Johan 74.0 65.0" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titanic_norm = titanic.copy()\n", "\n", "titanic_norm[\"AgeClip\"] = titanic[\"Age\"].clip(0, 65);\n", "\n", "titanic_norm[titanic_norm[\"Age\"] > 65][[\"Name\", \"Age\", \"AgeClip\"]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Винсоризация признака Возраст" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "56.0\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameAgeAgeWinsorize
34Wheadon, Mr. Edward H66.054.0
97Goldschmidt, Mr. George B71.054.0
117Connors, Mr. Patrick70.554.0
494Artagaveytia, Mr. Ramon71.054.0
631Barkworth, Mr. Algernon Henry Wilson80.054.0
673Mitchell, Mr. Henry Michael70.054.0
746Crosby, Capt. Edward Gifford70.054.0
852Svensson, Mr. Johan74.054.0
\n", "
" ], "text/plain": [ " Name Age AgeWinsorize\n", "34 Wheadon, Mr. Edward H 66.0 54.0\n", "97 Goldschmidt, Mr. George B 71.0 54.0\n", "117 Connors, Mr. Patrick 70.5 54.0\n", "494 Artagaveytia, Mr. Ramon 71.0 54.0\n", "631 Barkworth, Mr. Algernon Henry Wilson 80.0 54.0\n", "673 Mitchell, Mr. Henry Michael 70.0 54.0\n", "746 Crosby, Capt. Edward Gifford 70.0 54.0\n", "852 Svensson, Mr. Johan 74.0 54.0" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from scipy.stats.mstats import winsorize\n", "\n", "print(titanic_norm[\"Age\"].quantile(q=0.95))\n", "\n", "titanic_norm[\"AgeWinsorize\"] = winsorize(\n", " titanic_norm[\"Age\"].fillna(titanic_norm[\"Age\"].mean()), (0, 0.05), inplace=False\n", ")\n", "\n", "titanic_norm[titanic_norm[\"Age\"] > 65][[\"Name\", \"Age\", \"AgeWinsorize\"]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Нормализация значений" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameAgeAgeNormAgeClipNormAgeWinsorizeNormAgeWinsorizeNorm2
1Braund, Mr. Owen Harris22.00.2711740.3341590.402762-0.194476
2Cumings, Mrs. John Bradley (Florence Briggs Th...38.00.4722290.5819140.7013810.402762
3Heikkinen, Miss. Laina26.00.3214380.3960980.477417-0.045166
4Futrelle, Mrs. Jacques Heath (Lily May Peel)35.00.4345310.5354600.6453900.290780
5Allen, Mr. William Henry35.00.4345310.5354600.6453900.290780
6Moran, Mr. JamesNaNNaNNaN0.5464560.092912
7McCarthy, Mr. Timothy J54.00.6732850.8296691.0000001.000000
8Palsson, Master. Gosta Leonard2.00.0198540.0244660.029489-0.941023
9Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)27.00.3340040.4115830.496081-0.007839
10Nasser, Mrs. Nicholas (Adele Achem)14.00.1706460.2102820.253453-0.493094
11Sandstrom, Miss. Marguerite Rut4.00.0449860.0554350.066816-0.866368
12Bonnell, Miss. Elizabeth58.00.7235490.8916071.0000001.000000
13Saundercock, Mr. William Henry20.00.2460420.3031900.365435-0.269130
14Andersson, Mr. Anders Johan39.00.4847950.5973990.7200450.440090
15Vestrom, Miss. Hulda Amanda Adolfina14.00.1706460.2102820.253453-0.493094
16Hewlett, Mrs. (Mary D Kingcome)55.00.6858510.8451531.0000001.000000
17Rice, Master. Eugene2.00.0198540.0244660.029489-0.941023
18Williams, Mr. Charles EugeneNaNNaNNaN0.5464560.092912
19Vander Planke, Mrs. Julius (Emelia Maria Vande...31.00.3842670.4735210.5707350.141471
20Masselmani, Mrs. FatimaNaNNaNNaN0.5464560.092912
\n", "
" ], "text/plain": [ " Name Age AgeNorm \\\n", "1 Braund, Mr. Owen Harris 22.0 0.271174 \n", "2 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 0.472229 \n", "3 Heikkinen, Miss. Laina 26.0 0.321438 \n", "4 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 0.434531 \n", "5 Allen, Mr. William Henry 35.0 0.434531 \n", "6 Moran, Mr. James NaN NaN \n", "7 McCarthy, Mr. Timothy J 54.0 0.673285 \n", "8 Palsson, Master. Gosta Leonard 2.0 0.019854 \n", "9 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 27.0 0.334004 \n", "10 Nasser, Mrs. Nicholas (Adele Achem) 14.0 0.170646 \n", "11 Sandstrom, Miss. Marguerite Rut 4.0 0.044986 \n", "12 Bonnell, Miss. Elizabeth 58.0 0.723549 \n", "13 Saundercock, Mr. William Henry 20.0 0.246042 \n", "14 Andersson, Mr. Anders Johan 39.0 0.484795 \n", "15 Vestrom, Miss. Hulda Amanda Adolfina 14.0 0.170646 \n", "16 Hewlett, Mrs. (Mary D Kingcome) 55.0 0.685851 \n", "17 Rice, Master. Eugene 2.0 0.019854 \n", "18 Williams, Mr. Charles Eugene NaN NaN \n", "19 Vander Planke, Mrs. Julius (Emelia Maria Vande... 31.0 0.384267 \n", "20 Masselmani, Mrs. 
Fatima NaN NaN \n", "\n", " AgeClipNorm AgeWinsorizeNorm AgeWinsorizeNorm2 \n", "1 0.334159 0.402762 -0.194476 \n", "2 0.581914 0.701381 0.402762 \n", "3 0.396098 0.477417 -0.045166 \n", "4 0.535460 0.645390 0.290780 \n", "5 0.535460 0.645390 0.290780 \n", "6 NaN 0.546456 0.092912 \n", "7 0.829669 1.000000 1.000000 \n", "8 0.024466 0.029489 -0.941023 \n", "9 0.411583 0.496081 -0.007839 \n", "10 0.210282 0.253453 -0.493094 \n", "11 0.055435 0.066816 -0.866368 \n", "12 0.891607 1.000000 1.000000 \n", "13 0.303190 0.365435 -0.269130 \n", "14 0.597399 0.720045 0.440090 \n", "15 0.210282 0.253453 -0.493094 \n", "16 0.845153 1.000000 1.000000 \n", "17 0.024466 0.029489 -0.941023 \n", "18 NaN 0.546456 0.092912 \n", "19 0.473521 0.570735 0.141471 \n", "20 NaN 0.546456 0.092912 " ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn import preprocessing\n", "\n", "min_max_scaler = preprocessing.MinMaxScaler()\n", "\n", "min_max_scaler_2 = preprocessing.MinMaxScaler(feature_range=(-1, 1))\n", "\n", "titanic_norm[\"AgeNorm\"] = min_max_scaler.fit_transform(\n", " titanic_norm[\"Age\"].to_numpy().reshape(-1, 1)\n", ").reshape(titanic_norm[\"Age\"].shape)\n", "\n", "titanic_norm[\"AgeClipNorm\"] = min_max_scaler.fit_transform(\n", " titanic_norm[\"AgeClip\"].to_numpy().reshape(-1, 1)\n", ").reshape(titanic_norm[\"Age\"].shape)\n", "\n", "titanic_norm[\"AgeWinsorizeNorm\"] = min_max_scaler.fit_transform(\n", " titanic_norm[\"AgeWinsorize\"].to_numpy().reshape(-1, 1)\n", ").reshape(titanic_norm[\"Age\"].shape)\n", "\n", "titanic_norm[\"AgeWinsorizeNorm2\"] = min_max_scaler_2.fit_transform(\n", " titanic_norm[\"AgeWinsorize\"].to_numpy().reshape(-1, 1)\n", ").reshape(titanic_norm[\"Age\"].shape)\n", "\n", "titanic_norm[\n", " [\"Name\", \"Age\", \"AgeNorm\", \"AgeClipNorm\", \"AgeWinsorizeNorm\", \"AgeWinsorizeNorm2\"]\n", "].head(20)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Стандартизация значений" ] 
}, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameAgeAgeStandAgeClipStandAgeWinsorizeStand
1Braund, Mr. Owen Harris22.0-0.530377-0.532745-0.606602
2Cumings, Mrs. John Bradley (Florence Briggs Th...38.00.5718310.5850600.718863
3Heikkinen, Miss. Laina26.0-0.254825-0.253294-0.275236
4Futrelle, Mrs. Jacques Heath (Lily May Peel)35.00.3651670.3754720.470339
5Allen, Mr. William Henry35.00.3651670.3754720.470339
6Moran, Mr. JamesNaNNaNNaN0.031205
7McCarthy, Mr. Timothy J54.01.6740391.7028662.044329
8Palsson, Master. Gosta Leonard2.0-1.908136-1.930003-2.263435
9Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)27.0-0.185937-0.183431-0.192394
10Nasser, Mrs. Nicholas (Adele Achem)14.0-1.081480-1.091648-1.269335
11Sandstrom, Miss. Marguerite Rut4.0-1.770360-1.790277-2.097751
12Bonnell, Miss. Elizabeth58.01.9495911.9823172.044329
13Saundercock, Mr. William Henry20.0-0.668153-0.672471-0.772286
14Andersson, Mr. Anders Johan39.00.6407190.6549230.801705
15Vestrom, Miss. Hulda Amanda Adolfina14.0-1.081480-1.091648-1.269335
16Hewlett, Mrs. (Mary D Kingcome)55.01.7429271.7727292.044329
17Rice, Master. Eugene2.0-1.908136-1.930003-2.263435
18Williams, Mr. Charles EugeneNaNNaNNaN0.031205
19Vander Planke, Mrs. Julius (Emelia Maria Vande...31.00.0896150.0960200.138972
20Masselmani, Mrs. FatimaNaNNaNNaN0.031205
\n", "
" ], "text/plain": [ " Name Age AgeStand \\\n", "1 Braund, Mr. Owen Harris 22.0 -0.530377 \n", "2 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 0.571831 \n", "3 Heikkinen, Miss. Laina 26.0 -0.254825 \n", "4 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 0.365167 \n", "5 Allen, Mr. William Henry 35.0 0.365167 \n", "6 Moran, Mr. James NaN NaN \n", "7 McCarthy, Mr. Timothy J 54.0 1.674039 \n", "8 Palsson, Master. Gosta Leonard 2.0 -1.908136 \n", "9 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 27.0 -0.185937 \n", "10 Nasser, Mrs. Nicholas (Adele Achem) 14.0 -1.081480 \n", "11 Sandstrom, Miss. Marguerite Rut 4.0 -1.770360 \n", "12 Bonnell, Miss. Elizabeth 58.0 1.949591 \n", "13 Saundercock, Mr. William Henry 20.0 -0.668153 \n", "14 Andersson, Mr. Anders Johan 39.0 0.640719 \n", "15 Vestrom, Miss. Hulda Amanda Adolfina 14.0 -1.081480 \n", "16 Hewlett, Mrs. (Mary D Kingcome) 55.0 1.742927 \n", "17 Rice, Master. Eugene 2.0 -1.908136 \n", "18 Williams, Mr. Charles Eugene NaN NaN \n", "19 Vander Planke, Mrs. Julius (Emelia Maria Vande... 31.0 0.089615 \n", "20 Masselmani, Mrs. 
Fatima NaN NaN \n", "\n", " AgeClipStand AgeWinsorizeStand \n", "1 -0.532745 -0.606602 \n", "2 0.585060 0.718863 \n", "3 -0.253294 -0.275236 \n", "4 0.375472 0.470339 \n", "5 0.375472 0.470339 \n", "6 NaN 0.031205 \n", "7 1.702866 2.044329 \n", "8 -1.930003 -2.263435 \n", "9 -0.183431 -0.192394 \n", "10 -1.091648 -1.269335 \n", "11 -1.790277 -2.097751 \n", "12 1.982317 2.044329 \n", "13 -0.672471 -0.772286 \n", "14 0.654923 0.801705 \n", "15 -1.091648 -1.269335 \n", "16 1.772729 2.044329 \n", "17 -1.930003 -2.263435 \n", "18 NaN 0.031205 \n", "19 0.096020 0.138972 \n", "20 NaN 0.031205 " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn import preprocessing\n", "\n", "# Standardize the Age, AgeClip and AgeWinsorize columns with StandardScaler\n", "# and store each result in a matching \"<column>Stand\" column for comparison.\n", "standard_scaler = preprocessing.StandardScaler()\n", "\n", "for source_col in (\"Age\", \"AgeClip\", \"AgeWinsorize\"):\n", "    # StandardScaler expects a 2-D (n_samples, 1) array. fit_transform refits\n", "    # on every call, so a single scaler instance is reused for all columns.\n", "    scaled = standard_scaler.fit_transform(\n", "        titanic_norm[source_col].to_numpy().reshape(-1, 1)\n", "    )\n", "    # Flatten back to 1-D so the values can be assigned as a regular column.\n", "    titanic_norm[f\"{source_col}Stand\"] = scaled.ravel()\n", "\n", "titanic_norm[[\"Name\", \"Age\", \"AgeStand\", \"AgeClipStand\", \"AgeWinsorizeStand\"]].head(20)" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.7" } }, "nbformat": 4, "nbformat_minor": 2 }