From e5170c8c46469dc6195c624dcc883fc21faa9da8 Mon Sep 17 00:00:00 2001 From: Aleksey Filippov Date: Thu, 3 Oct 2024 01:14:22 +0400 Subject: [PATCH] Add feature engineering examples --- lec3.ipynb | 3107 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 3092 insertions(+), 15 deletions(-) diff --git a/lec3.ipynb b/lec3.ipynb index e76c2c8..25ba91c 100644 --- a/lec3.ipynb +++ b/lec3.ipynb @@ -4,7 +4,2227 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Пример использования библиотеки Featuretools для автоматического конструирования признаков\n", + "#### Унитарное кодирование\n", + "\n", + "Преобразование категориального признака в несколько бинарных признаков" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Загрузка набора данных Titanic" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
PassengerId
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
503Allen, Mr. William Henrymale35.0003734508.0500NaNS
....................................
88702Montvila, Rev. Juozasmale27.00021153613.0000NaNS
88811Graham, Miss. Margaret Edithfemale19.00011205330.0000B42S
88903Johnston, Miss. Catherine Helen \"Carrie\"femaleNaN12W./C. 660723.4500NaNS
89011Behr, Mr. Karl Howellmale26.00011136930.0000C148C
89103Dooley, Mr. Patrickmale32.0003703767.7500NaNQ
\n", + "

891 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Survived Pclass \\\n", + "PassengerId \n", + "1 0 3 \n", + "2 1 1 \n", + "3 1 3 \n", + "4 1 1 \n", + "5 0 3 \n", + "... ... ... \n", + "887 0 2 \n", + "888 1 1 \n", + "889 0 3 \n", + "890 1 1 \n", + "891 0 3 \n", + "\n", + " Name Sex Age \\\n", + "PassengerId \n", + "1 Braund, Mr. Owen Harris male 22.0 \n", + "2 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 \n", + "3 Heikkinen, Miss. Laina female 26.0 \n", + "4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 \n", + "5 Allen, Mr. William Henry male 35.0 \n", + "... ... ... ... \n", + "887 Montvila, Rev. Juozas male 27.0 \n", + "888 Graham, Miss. Margaret Edith female 19.0 \n", + "889 Johnston, Miss. Catherine Helen \"Carrie\" female NaN \n", + "890 Behr, Mr. Karl Howell male 26.0 \n", + "891 Dooley, Mr. Patrick male 32.0 \n", + "\n", + " SibSp Parch Ticket Fare Cabin Embarked \n", + "PassengerId \n", + "1 1 0 A/5 21171 7.2500 NaN S \n", + "2 1 0 PC 17599 71.2833 C85 C \n", + "3 0 0 STON/O2. 3101282 7.9250 NaN S \n", + "4 1 0 113803 53.1000 C123 S \n", + "5 0 0 373450 8.0500 NaN S \n", + "... ... ... ... ... ... ... \n", + "887 0 0 211536 13.0000 NaN S \n", + "888 0 0 112053 30.0000 B42 S \n", + "889 1 2 W./C. 6607 23.4500 NaN S \n", + "890 0 0 111369 30.0000 C148 C \n", + "891 0 0 370376 7.7500 NaN Q \n", + "\n", + "[891 rows x 11 columns]" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "titanic = pd.read_csv(\"data/titanic.csv\", index_col=\"PassengerId\")\n", + "\n", + "titanic" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Унитарное кодирование признаков Пол (Sex) и Порт посадки (Embarked)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Кодирование" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Embarked_QEmbarked_SEmbarked_nanSex_male
00.01.00.01.0
10.00.00.00.0
20.01.00.00.0
30.01.00.00.0
40.01.00.01.0
...............
8860.01.00.01.0
8870.01.00.00.0
8880.01.00.00.0
8890.00.00.01.0
8901.00.00.01.0
\n", + "

891 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " Embarked_Q Embarked_S Embarked_nan Sex_male\n", + "0 0.0 1.0 0.0 1.0\n", + "1 0.0 0.0 0.0 0.0\n", + "2 0.0 1.0 0.0 0.0\n", + "3 0.0 1.0 0.0 0.0\n", + "4 0.0 1.0 0.0 1.0\n", + ".. ... ... ... ...\n", + "886 0.0 1.0 0.0 1.0\n", + "887 0.0 1.0 0.0 0.0\n", + "888 0.0 1.0 0.0 0.0\n", + "889 0.0 0.0 0.0 1.0\n", + "890 1.0 0.0 0.0 1.0\n", + "\n", + "[891 rows x 4 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.preprocessing import OneHotEncoder\n", + "import numpy as np\n", + "\n", + "encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n", + "\n", + "encoded_values = encoder.fit_transform(titanic[[\"Embarked\", \"Sex\"]])\n", + "\n", + "encoded_columns = encoder.get_feature_names_out([\"Embarked\", \"Sex\"])\n", + "\n", + "encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n", + "\n", + "encoded_values_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Добавление признаков в исходный Dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
SurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedEmbarked_QEmbarked_SEmbarked_nanSex_male
10.03.0Braund, Mr. Owen Harrismale22.01.00.0A/5 211717.2500NaNS0.00.00.00.0
21.01.0Cumings, Mrs. John Bradley (Florence Briggs Th...female38.01.00.0PC 1759971.2833C85C0.01.00.00.0
31.03.0Heikkinen, Miss. Lainafemale26.00.00.0STON/O2. 31012827.9250NaNS0.01.00.00.0
41.01.0Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01.00.011380353.1000C123S0.01.00.01.0
50.03.0Allen, Mr. William Henrymale35.00.00.03734508.0500NaNS1.00.00.01.0
................................................
8881.01.0Graham, Miss. Margaret Edithfemale19.00.00.011205330.0000B42S0.01.00.00.0
8890.03.0Johnston, Miss. Catherine Helen \"Carrie\"femaleNaN1.02.0W./C. 660723.4500NaNS0.00.00.01.0
8901.01.0Behr, Mr. Karl Howellmale26.00.00.011136930.0000C148C1.00.00.01.0
8910.03.0Dooley, Mr. Patrickmale32.00.00.03703767.7500NaNQNaNNaNNaNNaN
0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0.01.00.01.0
\n", + "

892 rows × 15 columns

\n", + "
" + ], + "text/plain": [ + " Survived Pclass Name \\\n", + "1 0.0 3.0 Braund, Mr. Owen Harris \n", + "2 1.0 1.0 Cumings, Mrs. John Bradley (Florence Briggs Th... \n", + "3 1.0 3.0 Heikkinen, Miss. Laina \n", + "4 1.0 1.0 Futrelle, Mrs. Jacques Heath (Lily May Peel) \n", + "5 0.0 3.0 Allen, Mr. William Henry \n", + ".. ... ... ... \n", + "888 1.0 1.0 Graham, Miss. Margaret Edith \n", + "889 0.0 3.0 Johnston, Miss. Catherine Helen \"Carrie\" \n", + "890 1.0 1.0 Behr, Mr. Karl Howell \n", + "891 0.0 3.0 Dooley, Mr. Patrick \n", + "0 NaN NaN NaN \n", + "\n", + " Sex Age SibSp Parch Ticket Fare Cabin Embarked \\\n", + "1 male 22.0 1.0 0.0 A/5 21171 7.2500 NaN S \n", + "2 female 38.0 1.0 0.0 PC 17599 71.2833 C85 C \n", + "3 female 26.0 0.0 0.0 STON/O2. 3101282 7.9250 NaN S \n", + "4 female 35.0 1.0 0.0 113803 53.1000 C123 S \n", + "5 male 35.0 0.0 0.0 373450 8.0500 NaN S \n", + ".. ... ... ... ... ... ... ... ... \n", + "888 female 19.0 0.0 0.0 112053 30.0000 B42 S \n", + "889 female NaN 1.0 2.0 W./C. 6607 23.4500 NaN S \n", + "890 male 26.0 0.0 0.0 111369 30.0000 C148 C \n", + "891 male 32.0 0.0 0.0 370376 7.7500 NaN Q \n", + "0 NaN NaN NaN NaN NaN NaN NaN NaN \n", + "\n", + " Embarked_Q Embarked_S Embarked_nan Sex_male \n", + "1 0.0 0.0 0.0 0.0 \n", + "2 0.0 1.0 0.0 0.0 \n", + "3 0.0 1.0 0.0 0.0 \n", + "4 0.0 1.0 0.0 1.0 \n", + "5 1.0 0.0 0.0 1.0 \n", + ".. ... ... ... ... 
\n", + "888 0.0 1.0 0.0 0.0 \n", + "889 0.0 0.0 0.0 1.0 \n", + "890 1.0 0.0 0.0 1.0 \n", + "891 NaN NaN NaN NaN \n", + "0 0.0 1.0 0.0 1.0 \n", + "\n", + "[892 rows x 15 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic = pd.concat([titanic, encoded_values_df.set_index(titanic.index)], axis=1)  # align encoded rows with the PassengerId index before joining\n", + "\n", + "titanic" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Дискретизация признаков" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Равномерное разделение данных на 3 группы" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "labels = [\"young\", \"middle-aged\", \"old\"]\n", + "num_bins = 3" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([ 0.42 , 26.94666667, 53.47333333, 80. ]),\n", + " array([319, 523, 50]))" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hist1, bins1 = np.histogram(titanic[\"Age\"].fillna(titanic[\"Age\"].median()), bins=num_bins)\n", + "bins1, hist1" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeAge
122.0(0.42, 26.947]
238.0(26.947, 53.473]
326.0(0.42, 26.947]
435.0(26.947, 53.473]
535.0(26.947, 53.473]
6NaNNaN
754.0(53.473, 80.0]
82.0(0.42, 26.947]
927.0(26.947, 53.473]
1014.0(0.42, 26.947]
114.0(0.42, 26.947]
1258.0(53.473, 80.0]
1320.0(0.42, 26.947]
1439.0(26.947, 53.473]
1514.0(0.42, 26.947]
1655.0(53.473, 80.0]
172.0(0.42, 26.947]
18NaNNaN
1931.0(26.947, 53.473]
20NaNNaN
\n", + "
" + ], + "text/plain": [ + " Age Age\n", + "1 22.0 (0.42, 26.947]\n", + "2 38.0 (26.947, 53.473]\n", + "3 26.0 (0.42, 26.947]\n", + "4 35.0 (26.947, 53.473]\n", + "5 35.0 (26.947, 53.473]\n", + "6 NaN NaN\n", + "7 54.0 (53.473, 80.0]\n", + "8 2.0 (0.42, 26.947]\n", + "9 27.0 (26.947, 53.473]\n", + "10 14.0 (0.42, 26.947]\n", + "11 4.0 (0.42, 26.947]\n", + "12 58.0 (53.473, 80.0]\n", + "13 20.0 (0.42, 26.947]\n", + "14 39.0 (26.947, 53.473]\n", + "15 14.0 (0.42, 26.947]\n", + "16 55.0 (53.473, 80.0]\n", + "17 2.0 (0.42, 26.947]\n", + "18 NaN NaN\n", + "19 31.0 (26.947, 53.473]\n", + "20 NaN NaN" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins1), include_lowest=True)], axis=1).head(20)  # include_lowest keeps the minimum age (the left edge of bins1) in the first bin" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeAge
122.0young
238.0middle-aged
326.0young
435.0middle-aged
535.0middle-aged
6NaNNaN
754.0old
82.0young
927.0middle-aged
1014.0young
114.0young
1258.0old
1320.0young
1439.0middle-aged
1514.0young
1655.0old
172.0young
18NaNNaN
1931.0middle-aged
20NaNNaN
\n", + "
" + ], + "text/plain": [ + " Age Age\n", + "1 22.0 young\n", + "2 38.0 middle-aged\n", + "3 26.0 young\n", + "4 35.0 middle-aged\n", + "5 35.0 middle-aged\n", + "6 NaN NaN\n", + "7 54.0 old\n", + "8 2.0 young\n", + "9 27.0 middle-aged\n", + "10 14.0 young\n", + "11 4.0 young\n", + "12 58.0 old\n", + "13 20.0 young\n", + "14 39.0 middle-aged\n", + "15 14.0 young\n", + "16 55.0 old\n", + "17 2.0 young\n", + "18 NaN NaN\n", + "19 31.0 middle-aged\n", + "20 NaN NaN" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins1), labels=labels, include_lowest=True)], axis=1).head(20)  # include_lowest keeps the minimum age (the left edge of bins1) in the first bin" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Равномерное разделение данных на 3 группы c установкой собственной границы диапазона значений (от 0 до 100)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([ 0. , 33.33333333, 66.66666667, 100. ]),\n", + " array([641, 244, 7]))" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bins2 = np.linspace(0, 100, 4)\n", + "tmp_bins2 = np.digitize(titanic[\"Age\"].fillna(titanic[\"Age\"].median()), bins2)\n", + "hist2 = np.bincount(tmp_bins2 - 1)\n", + "bins2, hist2" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeAge
122.0(0.0, 33.333]
238.0(33.333, 66.667]
326.0(0.0, 33.333]
435.0(33.333, 66.667]
535.0(33.333, 66.667]
6NaNNaN
754.0(33.333, 66.667]
82.0(0.0, 33.333]
927.0(0.0, 33.333]
1014.0(0.0, 33.333]
114.0(0.0, 33.333]
1258.0(33.333, 66.667]
1320.0(0.0, 33.333]
1439.0(33.333, 66.667]
1514.0(0.0, 33.333]
1655.0(33.333, 66.667]
172.0(0.0, 33.333]
18NaNNaN
1931.0(0.0, 33.333]
20NaNNaN
\n", + "
" + ], + "text/plain": [ + " Age Age\n", + "1 22.0 (0.0, 33.333]\n", + "2 38.0 (33.333, 66.667]\n", + "3 26.0 (0.0, 33.333]\n", + "4 35.0 (33.333, 66.667]\n", + "5 35.0 (33.333, 66.667]\n", + "6 NaN NaN\n", + "7 54.0 (33.333, 66.667]\n", + "8 2.0 (0.0, 33.333]\n", + "9 27.0 (0.0, 33.333]\n", + "10 14.0 (0.0, 33.333]\n", + "11 4.0 (0.0, 33.333]\n", + "12 58.0 (33.333, 66.667]\n", + "13 20.0 (0.0, 33.333]\n", + "14 39.0 (33.333, 66.667]\n", + "15 14.0 (0.0, 33.333]\n", + "16 55.0 (33.333, 66.667]\n", + "17 2.0 (0.0, 33.333]\n", + "18 NaN NaN\n", + "19 31.0 (0.0, 33.333]\n", + "20 NaN NaN" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins2))], axis=1).head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeAge
122.0young
238.0middle-aged
326.0young
435.0middle-aged
535.0middle-aged
6NaNNaN
754.0middle-aged
82.0young
927.0young
1014.0young
114.0young
1258.0middle-aged
1320.0young
1439.0middle-aged
1514.0young
1655.0middle-aged
172.0young
18NaNNaN
1931.0young
20NaNNaN
\n", + "
" + ], + "text/plain": [ + " Age Age\n", + "1 22.0 young\n", + "2 38.0 middle-aged\n", + "3 26.0 young\n", + "4 35.0 middle-aged\n", + "5 35.0 middle-aged\n", + "6 NaN NaN\n", + "7 54.0 middle-aged\n", + "8 2.0 young\n", + "9 27.0 young\n", + "10 14.0 young\n", + "11 4.0 young\n", + "12 58.0 middle-aged\n", + "13 20.0 young\n", + "14 39.0 middle-aged\n", + "15 14.0 young\n", + "16 55.0 middle-aged\n", + "17 2.0 young\n", + "18 NaN NaN\n", + "19 31.0 young\n", + "20 NaN NaN" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins2), labels=labels)], axis=1).head(20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Разделение данных на 3 группы с установкой собственных границ интервалов (0, 40, 60, 100)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([ 0, 40, 60, 100]), array([729, 137, 26]))" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hist3, bins3 = np.histogram(\n", + " titanic[\"Age\"].fillna(titanic[\"Age\"].median()), bins=[0, 40, 60, 100]\n", + ")\n", + "bins3, hist3" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeAge
122.0(0.0, 40.0]
238.0(0.0, 40.0]
326.0(0.0, 40.0]
435.0(0.0, 40.0]
535.0(0.0, 40.0]
6NaNNaN
754.0(40.0, 60.0]
82.0(0.0, 40.0]
927.0(0.0, 40.0]
1014.0(0.0, 40.0]
114.0(0.0, 40.0]
1258.0(40.0, 60.0]
1320.0(0.0, 40.0]
1439.0(0.0, 40.0]
1514.0(0.0, 40.0]
1655.0(40.0, 60.0]
172.0(0.0, 40.0]
18NaNNaN
1931.0(0.0, 40.0]
20NaNNaN
\n", + "
" + ], + "text/plain": [ + " Age Age\n", + "1 22.0 (0.0, 40.0]\n", + "2 38.0 (0.0, 40.0]\n", + "3 26.0 (0.0, 40.0]\n", + "4 35.0 (0.0, 40.0]\n", + "5 35.0 (0.0, 40.0]\n", + "6 NaN NaN\n", + "7 54.0 (40.0, 60.0]\n", + "8 2.0 (0.0, 40.0]\n", + "9 27.0 (0.0, 40.0]\n", + "10 14.0 (0.0, 40.0]\n", + "11 4.0 (0.0, 40.0]\n", + "12 58.0 (40.0, 60.0]\n", + "13 20.0 (0.0, 40.0]\n", + "14 39.0 (0.0, 40.0]\n", + "15 14.0 (0.0, 40.0]\n", + "16 55.0 (40.0, 60.0]\n", + "17 2.0 (0.0, 40.0]\n", + "18 NaN NaN\n", + "19 31.0 (0.0, 40.0]\n", + "20 NaN NaN" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins3))], axis=1).head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeAge
122.0young
238.0young
326.0young
435.0young
535.0young
6NaNNaN
754.0middle-aged
82.0young
927.0young
1014.0young
114.0young
1258.0middle-aged
1320.0young
1439.0young
1514.0young
1655.0middle-aged
172.0young
18NaNNaN
1931.0young
20NaNNaN
\n", + "
" + ], + "text/plain": [ + " Age Age\n", + "1 22.0 young\n", + "2 38.0 young\n", + "3 26.0 young\n", + "4 35.0 young\n", + "5 35.0 young\n", + "6 NaN NaN\n", + "7 54.0 middle-aged\n", + "8 2.0 young\n", + "9 27.0 young\n", + "10 14.0 young\n", + "11 4.0 young\n", + "12 58.0 middle-aged\n", + "13 20.0 young\n", + "14 39.0 young\n", + "15 14.0 young\n", + "16 55.0 middle-aged\n", + "17 2.0 young\n", + "18 NaN NaN\n", + "19 31.0 young\n", + "20 NaN NaN" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins3), labels=labels)], axis=1).head(20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Квантильное разделение данных на 3 группы" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeAge
122.00.0
238.02.0
326.01.0
435.02.0
535.02.0
6NaNNaN
754.02.0
82.00.0
927.01.0
1014.00.0
114.00.0
1258.02.0
1320.00.0
1439.02.0
1514.00.0
1655.02.0
172.00.0
18NaNNaN
1931.01.0
20NaNNaN
\n", + "
" + ], + "text/plain": [ + " Age Age\n", + "1 22.0 0.0\n", + "2 38.0 2.0\n", + "3 26.0 1.0\n", + "4 35.0 2.0\n", + "5 35.0 2.0\n", + "6 NaN NaN\n", + "7 54.0 2.0\n", + "8 2.0 0.0\n", + "9 27.0 1.0\n", + "10 14.0 0.0\n", + "11 4.0 0.0\n", + "12 58.0 2.0\n", + "13 20.0 0.0\n", + "14 39.0 2.0\n", + "15 14.0 0.0\n", + "16 55.0 2.0\n", + "17 2.0 0.0\n", + "18 NaN NaN\n", + "19 31.0 1.0\n", + "20 NaN NaN" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([titanic[\"Age\"], pd.qcut(titanic[\"Age\"], q=3, labels=False)], axis=1).head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeAge
122.0young
238.0old
326.0middle-aged
435.0old
535.0old
6NaNNaN
754.0old
82.0young
927.0middle-aged
1014.0young
114.0young
1258.0old
1320.0young
1439.0old
1514.0young
1655.0old
172.0young
18NaNNaN
1931.0middle-aged
20NaNNaN
\n", + "
" + ], + "text/plain": [ + " Age Age\n", + "1 22.0 young\n", + "2 38.0 old\n", + "3 26.0 middle-aged\n", + "4 35.0 old\n", + "5 35.0 old\n", + "6 NaN NaN\n", + "7 54.0 old\n", + "8 2.0 young\n", + "9 27.0 middle-aged\n", + "10 14.0 young\n", + "11 4.0 young\n", + "12 58.0 old\n", + "13 20.0 young\n", + "14 39.0 old\n", + "15 14.0 young\n", + "16 55.0 old\n", + "17 2.0 young\n", + "18 NaN NaN\n", + "19 31.0 middle-aged\n", + "20 NaN NaN" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([titanic[\"Age\"], pd.qcut(titanic[\"Age\"], q=3, labels=labels)], axis=1).head(20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Пример использования библиотеки Featuretools для автоматического конструирования (синтеза) признаков\n", "\n", "https://featuretools.alteryx.com/en/stable/getting_started/using_entitysets.html" ] @@ -24,11 +2244,10 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ - "import pandas as pd\n", "import featuretools as ft\n", "from woodwork.logical_types import Categorical, Datetime\n", "\n", @@ -57,7 +2276,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -112,7 +2331,7 @@ " No relationships" ] }, - "execution_count": 52, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -189,7 +2408,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -209,7 +2428,7 @@ " order_items.seller_id -> sellers.seller_id" ] }, - "execution_count": 53, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -229,14 +2448,14 @@ "source": [ "#### Автоматическое конструирование признаков с помощью featuretools\n", "\n", - "Библиотека применят различные функции агрегации к атрибутам таблицы order_items с учетом отношений\n", + "Библиотека 
применяет различные функции агрегации и трансформации к атрибутам таблицы order_items с учетом отношений\n", "\n", "Результат помещается в Dataframe feature_matrix" ] }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -247,11 +2466,11 @@ " agg_primitives: ['any', 'mode']\n", "This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.\n", " warnings.warn(warning_msg, UnusedPrimitiveWarning)\n", - "c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n", + "c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n", " ).agg(to_agg)\n", - "c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n", + "c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable is currently using SeriesGroupBy.mean. 
In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n", " ).agg(to_agg)\n", - "c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n", + "c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n", " ).agg(to_agg)\n" ] }, @@ -765,7 +2984,7 @@ "[115 rows x 43 columns]" ] }, - "execution_count": 54, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -793,7 +3012,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -844,7 +3063,7 @@ " ]" ] }, - "execution_count": 55, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -852,6 +3071,864 @@ "source": [ "feature_defs" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Отсечение значений признаков" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Определение выбросов с помощью boxplot" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 148, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic.boxplot(column=\"Age\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Отсечение данных для признака Возраст, значение которых больше 65 
лет" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameAgeAgeClip
34Wheadon, Mr. Edward H66.065.0
97Goldschmidt, Mr. George B71.065.0
117Connors, Mr. Patrick70.565.0
494Artagaveytia, Mr. Ramon71.065.0
631Barkworth, Mr. Algernon Henry Wilson80.065.0
673Mitchell, Mr. Henry Michael70.065.0
746Crosby, Capt. Edward Gifford70.065.0
852Svensson, Mr. Johan74.065.0
\n", + "
" + ], + "text/plain": [ + " Name Age AgeClip\n", + "34 Wheadon, Mr. Edward H 66.0 65.0\n", + "97 Goldschmidt, Mr. George B 71.0 65.0\n", + "117 Connors, Mr. Patrick 70.5 65.0\n", + "494 Artagaveytia, Mr. Ramon 71.0 65.0\n", + "631 Barkworth, Mr. Algernon Henry Wilson 80.0 65.0\n", + "673 Mitchell, Mr. Henry Michael 70.0 65.0\n", + "746 Crosby, Capt. Edward Gifford 70.0 65.0\n", + "852 Svensson, Mr. Johan 74.0 65.0" + ] + }, + "execution_count": 149, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_norm = titanic.copy()\n", + "\n", + "titanic_norm[\"AgeClip\"] = titanic[\"Age\"].clip(0, 65);\n", + "\n", + "titanic_norm[titanic_norm[\"Age\"] > 65][[\"Name\", \"Age\", \"AgeClip\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Винсоризация признака Возраст" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "56.0\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameAgeAgeWinsorize
34Wheadon, Mr. Edward H66.054.0
97Goldschmidt, Mr. George B71.054.0
117Connors, Mr. Patrick70.554.0
494Artagaveytia, Mr. Ramon71.054.0
631Barkworth, Mr. Algernon Henry Wilson80.054.0
673Mitchell, Mr. Henry Michael70.054.0
746Crosby, Capt. Edward Gifford70.054.0
852Svensson, Mr. Johan74.054.0
\n", + "
" + ], + "text/plain": [ + " Name Age AgeWinsorize\n", + "34 Wheadon, Mr. Edward H 66.0 54.0\n", + "97 Goldschmidt, Mr. George B 71.0 54.0\n", + "117 Connors, Mr. Patrick 70.5 54.0\n", + "494 Artagaveytia, Mr. Ramon 71.0 54.0\n", + "631 Barkworth, Mr. Algernon Henry Wilson 80.0 54.0\n", + "673 Mitchell, Mr. Henry Michael 70.0 54.0\n", + "746 Crosby, Capt. Edward Gifford 70.0 54.0\n", + "852 Svensson, Mr. Johan 74.0 54.0" + ] + }, + "execution_count": 150, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from scipy.stats.mstats import winsorize\n", + "\n", + "print(titanic_norm[\"Age\"].quantile(q=0.95))\n", + "\n", + "titanic_norm[\"AgeWinsorize\"] = winsorize(\n", + " titanic_norm[\"Age\"].fillna(titanic_norm[\"Age\"].mean()), (0, 0.05), inplace=False\n", + ")\n", + "\n", + "titanic_norm[titanic_norm[\"Age\"] > 65][[\"Name\", \"Age\", \"AgeWinsorize\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Нормализация значений" + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameAgeAgeNormAgeClipNormAgeWinsorizeNormAgeWinsorizeNorm2
1Braund, Mr. Owen Harris22.00.2711740.3341590.402762-0.194476
2Cumings, Mrs. John Bradley (Florence Briggs Th...38.00.4722290.5819140.7013810.402762
3Heikkinen, Miss. Laina26.00.3214380.3960980.477417-0.045166
4Futrelle, Mrs. Jacques Heath (Lily May Peel)35.00.4345310.5354600.6453900.290780
5Allen, Mr. William Henry35.00.4345310.5354600.6453900.290780
6Moran, Mr. JamesNaNNaNNaN0.5464560.092912
7McCarthy, Mr. Timothy J54.00.6732850.8296691.0000001.000000
8Palsson, Master. Gosta Leonard2.00.0198540.0244660.029489-0.941023
9Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)27.00.3340040.4115830.496081-0.007839
10Nasser, Mrs. Nicholas (Adele Achem)14.00.1706460.2102820.253453-0.493094
11Sandstrom, Miss. Marguerite Rut4.00.0449860.0554350.066816-0.866368
12Bonnell, Miss. Elizabeth58.00.7235490.8916071.0000001.000000
13Saundercock, Mr. William Henry20.00.2460420.3031900.365435-0.269130
14Andersson, Mr. Anders Johan39.00.4847950.5973990.7200450.440090
15Vestrom, Miss. Hulda Amanda Adolfina14.00.1706460.2102820.253453-0.493094
16Hewlett, Mrs. (Mary D Kingcome)55.00.6858510.8451531.0000001.000000
17Rice, Master. Eugene2.00.0198540.0244660.029489-0.941023
18Williams, Mr. Charles EugeneNaNNaNNaN0.5464560.092912
19Vander Planke, Mrs. Julius (Emelia Maria Vande...31.00.3842670.4735210.5707350.141471
20Masselmani, Mrs. FatimaNaNNaNNaN0.5464560.092912
\n", + "
" + ], + "text/plain": [ + " Name Age AgeNorm \\\n", + "1 Braund, Mr. Owen Harris 22.0 0.271174 \n", + "2 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 0.472229 \n", + "3 Heikkinen, Miss. Laina 26.0 0.321438 \n", + "4 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 0.434531 \n", + "5 Allen, Mr. William Henry 35.0 0.434531 \n", + "6 Moran, Mr. James NaN NaN \n", + "7 McCarthy, Mr. Timothy J 54.0 0.673285 \n", + "8 Palsson, Master. Gosta Leonard 2.0 0.019854 \n", + "9 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 27.0 0.334004 \n", + "10 Nasser, Mrs. Nicholas (Adele Achem) 14.0 0.170646 \n", + "11 Sandstrom, Miss. Marguerite Rut 4.0 0.044986 \n", + "12 Bonnell, Miss. Elizabeth 58.0 0.723549 \n", + "13 Saundercock, Mr. William Henry 20.0 0.246042 \n", + "14 Andersson, Mr. Anders Johan 39.0 0.484795 \n", + "15 Vestrom, Miss. Hulda Amanda Adolfina 14.0 0.170646 \n", + "16 Hewlett, Mrs. (Mary D Kingcome) 55.0 0.685851 \n", + "17 Rice, Master. Eugene 2.0 0.019854 \n", + "18 Williams, Mr. Charles Eugene NaN NaN \n", + "19 Vander Planke, Mrs. Julius (Emelia Maria Vande... 31.0 0.384267 \n", + "20 Masselmani, Mrs. 
Fatima NaN NaN \n", + "\n", + " AgeClipNorm AgeWinsorizeNorm AgeWinsorizeNorm2 \n", + "1 0.334159 0.402762 -0.194476 \n", + "2 0.581914 0.701381 0.402762 \n", + "3 0.396098 0.477417 -0.045166 \n", + "4 0.535460 0.645390 0.290780 \n", + "5 0.535460 0.645390 0.290780 \n", + "6 NaN 0.546456 0.092912 \n", + "7 0.829669 1.000000 1.000000 \n", + "8 0.024466 0.029489 -0.941023 \n", + "9 0.411583 0.496081 -0.007839 \n", + "10 0.210282 0.253453 -0.493094 \n", + "11 0.055435 0.066816 -0.866368 \n", + "12 0.891607 1.000000 1.000000 \n", + "13 0.303190 0.365435 -0.269130 \n", + "14 0.597399 0.720045 0.440090 \n", + "15 0.210282 0.253453 -0.493094 \n", + "16 0.845153 1.000000 1.000000 \n", + "17 0.024466 0.029489 -0.941023 \n", + "18 NaN 0.546456 0.092912 \n", + "19 0.473521 0.570735 0.141471 \n", + "20 NaN 0.546456 0.092912 " + ] + }, + "execution_count": 153, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn import preprocessing\n", + "\n", + "min_max_scaler = preprocessing.MinMaxScaler()\n", + "\n", + "min_max_scaler_2 = preprocessing.MinMaxScaler(feature_range=(-1, 1))\n", + "\n", + "titanic_norm[\"AgeNorm\"] = min_max_scaler.fit_transform(\n", + " titanic_norm[\"Age\"].to_numpy().reshape(-1, 1)\n", + ").reshape(titanic_norm[\"Age\"].shape)\n", + "\n", + "titanic_norm[\"AgeClipNorm\"] = min_max_scaler.fit_transform(\n", + " titanic_norm[\"AgeClip\"].to_numpy().reshape(-1, 1)\n", + ").reshape(titanic_norm[\"Age\"].shape)\n", + "\n", + "titanic_norm[\"AgeWinsorizeNorm\"] = min_max_scaler.fit_transform(\n", + " titanic_norm[\"AgeWinsorize\"].to_numpy().reshape(-1, 1)\n", + ").reshape(titanic_norm[\"Age\"].shape)\n", + "\n", + "titanic_norm[\"AgeWinsorizeNorm2\"] = min_max_scaler_2.fit_transform(\n", + " titanic_norm[\"AgeWinsorize\"].to_numpy().reshape(-1, 1)\n", + ").reshape(titanic_norm[\"Age\"].shape)\n", + "\n", + "titanic_norm[\n", + " [\"Name\", \"Age\", \"AgeNorm\", \"AgeClipNorm\", \"AgeWinsorizeNorm\", 
\"AgeWinsorizeNorm2\"]\n", + "].head(20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Стандартизация значений" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameAgeAgeStandAgeClipStandAgeWinsorizeStand
1Braund, Mr. Owen Harris22.0-0.530377-0.532745-0.606602
2Cumings, Mrs. John Bradley (Florence Briggs Th...38.00.5718310.5850600.718863
3Heikkinen, Miss. Laina26.0-0.254825-0.253294-0.275236
4Futrelle, Mrs. Jacques Heath (Lily May Peel)35.00.3651670.3754720.470339
5Allen, Mr. William Henry35.00.3651670.3754720.470339
6Moran, Mr. JamesNaNNaNNaN0.031205
7McCarthy, Mr. Timothy J54.01.6740391.7028662.044329
8Palsson, Master. Gosta Leonard2.0-1.908136-1.930003-2.263435
9Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)27.0-0.185937-0.183431-0.192394
10Nasser, Mrs. Nicholas (Adele Achem)14.0-1.081480-1.091648-1.269335
11Sandstrom, Miss. Marguerite Rut4.0-1.770360-1.790277-2.097751
12Bonnell, Miss. Elizabeth58.01.9495911.9823172.044329
13Saundercock, Mr. William Henry20.0-0.668153-0.672471-0.772286
14Andersson, Mr. Anders Johan39.00.6407190.6549230.801705
15Vestrom, Miss. Hulda Amanda Adolfina14.0-1.081480-1.091648-1.269335
16Hewlett, Mrs. (Mary D Kingcome)55.01.7429271.7727292.044329
17Rice, Master. Eugene2.0-1.908136-1.930003-2.263435
18Williams, Mr. Charles EugeneNaNNaNNaN0.031205
19Vander Planke, Mrs. Julius (Emelia Maria Vande...31.00.0896150.0960200.138972
20Masselmani, Mrs. FatimaNaNNaNNaN0.031205
\n", + "
" + ], + "text/plain": [ + " Name Age AgeStand \\\n", + "1 Braund, Mr. Owen Harris 22.0 -0.530377 \n", + "2 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 0.571831 \n", + "3 Heikkinen, Miss. Laina 26.0 -0.254825 \n", + "4 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 0.365167 \n", + "5 Allen, Mr. William Henry 35.0 0.365167 \n", + "6 Moran, Mr. James NaN NaN \n", + "7 McCarthy, Mr. Timothy J 54.0 1.674039 \n", + "8 Palsson, Master. Gosta Leonard 2.0 -1.908136 \n", + "9 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 27.0 -0.185937 \n", + "10 Nasser, Mrs. Nicholas (Adele Achem) 14.0 -1.081480 \n", + "11 Sandstrom, Miss. Marguerite Rut 4.0 -1.770360 \n", + "12 Bonnell, Miss. Elizabeth 58.0 1.949591 \n", + "13 Saundercock, Mr. William Henry 20.0 -0.668153 \n", + "14 Andersson, Mr. Anders Johan 39.0 0.640719 \n", + "15 Vestrom, Miss. Hulda Amanda Adolfina 14.0 -1.081480 \n", + "16 Hewlett, Mrs. (Mary D Kingcome) 55.0 1.742927 \n", + "17 Rice, Master. Eugene 2.0 -1.908136 \n", + "18 Williams, Mr. Charles Eugene NaN NaN \n", + "19 Vander Planke, Mrs. Julius (Emelia Maria Vande... 31.0 0.089615 \n", + "20 Masselmani, Mrs. 
Fatima NaN NaN \n", + "\n", + " AgeClipStand AgeWinsorizeStand \n", + "1 -0.532745 -0.606602 \n", + "2 0.585060 0.718863 \n", + "3 -0.253294 -0.275236 \n", + "4 0.375472 0.470339 \n", + "5 0.375472 0.470339 \n", + "6 NaN 0.031205 \n", + "7 1.702866 2.044329 \n", + "8 -1.930003 -2.263435 \n", + "9 -0.183431 -0.192394 \n", + "10 -1.091648 -1.269335 \n", + "11 -1.790277 -2.097751 \n", + "12 1.982317 2.044329 \n", + "13 -0.672471 -0.772286 \n", + "14 0.654923 0.801705 \n", + "15 -1.091648 -1.269335 \n", + "16 1.772729 2.044329 \n", + "17 -1.930003 -2.263435 \n", + "18 NaN 0.031205 \n", + "19 0.096020 0.138972 \n", + "20 NaN 0.031205 " + ] + }, + "execution_count": 152, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn import preprocessing\n", + "\n", + "stndart_scaler = preprocessing.StandardScaler()\n", + "\n", + "titanic_norm[\"AgeStand\"] = stndart_scaler.fit_transform(\n", + " titanic_norm[\"Age\"].to_numpy().reshape(-1, 1)\n", + ").reshape(titanic_norm[\"Age\"].shape)\n", + "\n", + "titanic_norm[\"AgeClipStand\"] = stndart_scaler.fit_transform(\n", + " titanic_norm[\"AgeClip\"].to_numpy().reshape(-1, 1)\n", + ").reshape(titanic_norm[\"Age\"].shape)\n", + "\n", + "titanic_norm[\"AgeWinsorizeStand\"] = stndart_scaler.fit_transform(\n", + " titanic_norm[\"AgeWinsorize\"].to_numpy().reshape(-1, 1)\n", + ").reshape(titanic_norm[\"Age\"].shape)\n", + "\n", + "titanic_norm[[\"Name\", \"Age\", \"AgeStand\", \"AgeClipStand\", \"AgeWinsorizeStand\"]].head(20)" + ] } ], "metadata": {