diff --git a/lec3.ipynb b/lec3.ipynb
index e76c2c8..25ba91c 100644
--- a/lec3.ipynb
+++ b/lec3.ipynb
@@ -4,7 +4,2227 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Пример использования библиотеки Featuretools для автоматического конструирования признаков\n",
+ "#### Унитарное кодирование\n",
+ "\n",
+ "Преобразование категориального признака в несколько бинарных признаков"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Загрузка набора данных Titanic"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Survived | \n",
+ " Pclass | \n",
+ " Name | \n",
+ " Sex | \n",
+ " Age | \n",
+ " SibSp | \n",
+ " Parch | \n",
+ " Ticket | \n",
+ " Fare | \n",
+ " Cabin | \n",
+ " Embarked | \n",
+ "
\n",
+ " \n",
+ " PassengerId | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Braund, Mr. Owen Harris | \n",
+ " male | \n",
+ " 22.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " A/5 21171 | \n",
+ " 7.2500 | \n",
+ " NaN | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Cumings, Mrs. John Bradley (Florence Briggs Th... | \n",
+ " female | \n",
+ " 38.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " PC 17599 | \n",
+ " 71.2833 | \n",
+ " C85 | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " Heikkinen, Miss. Laina | \n",
+ " female | \n",
+ " 26.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " STON/O2. 3101282 | \n",
+ " 7.9250 | \n",
+ " NaN | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n",
+ " female | \n",
+ " 35.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 113803 | \n",
+ " 53.1000 | \n",
+ " C123 | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Allen, Mr. William Henry | \n",
+ " male | \n",
+ " 35.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 373450 | \n",
+ " 8.0500 | \n",
+ " NaN | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 887 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " Montvila, Rev. Juozas | \n",
+ " male | \n",
+ " 27.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 211536 | \n",
+ " 13.0000 | \n",
+ " NaN | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ " 888 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Graham, Miss. Margaret Edith | \n",
+ " female | \n",
+ " 19.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 112053 | \n",
+ " 30.0000 | \n",
+ " B42 | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ " 889 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Johnston, Miss. Catherine Helen \"Carrie\" | \n",
+ " female | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " W./C. 6607 | \n",
+ " 23.4500 | \n",
+ " NaN | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ " 890 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Behr, Mr. Karl Howell | \n",
+ " male | \n",
+ " 26.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 111369 | \n",
+ " 30.0000 | \n",
+ " C148 | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " 891 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Dooley, Mr. Patrick | \n",
+ " male | \n",
+ " 32.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 370376 | \n",
+ " 7.7500 | \n",
+ " NaN | \n",
+ " Q | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
891 rows × 11 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Survived Pclass \\\n",
+ "PassengerId \n",
+ "1 0 3 \n",
+ "2 1 1 \n",
+ "3 1 3 \n",
+ "4 1 1 \n",
+ "5 0 3 \n",
+ "... ... ... \n",
+ "887 0 2 \n",
+ "888 1 1 \n",
+ "889 0 3 \n",
+ "890 1 1 \n",
+ "891 0 3 \n",
+ "\n",
+ " Name Sex Age \\\n",
+ "PassengerId \n",
+ "1 Braund, Mr. Owen Harris male 22.0 \n",
+ "2 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 \n",
+ "3 Heikkinen, Miss. Laina female 26.0 \n",
+ "4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 \n",
+ "5 Allen, Mr. William Henry male 35.0 \n",
+ "... ... ... ... \n",
+ "887 Montvila, Rev. Juozas male 27.0 \n",
+ "888 Graham, Miss. Margaret Edith female 19.0 \n",
+ "889 Johnston, Miss. Catherine Helen \"Carrie\" female NaN \n",
+ "890 Behr, Mr. Karl Howell male 26.0 \n",
+ "891 Dooley, Mr. Patrick male 32.0 \n",
+ "\n",
+ " SibSp Parch Ticket Fare Cabin Embarked \n",
+ "PassengerId \n",
+ "1 1 0 A/5 21171 7.2500 NaN S \n",
+ "2 1 0 PC 17599 71.2833 C85 C \n",
+ "3 0 0 STON/O2. 3101282 7.9250 NaN S \n",
+ "4 1 0 113803 53.1000 C123 S \n",
+ "5 0 0 373450 8.0500 NaN S \n",
+ "... ... ... ... ... ... ... \n",
+ "887 0 0 211536 13.0000 NaN S \n",
+ "888 0 0 112053 30.0000 B42 S \n",
+ "889 1 2 W./C. 6607 23.4500 NaN S \n",
+ "890 0 0 111369 30.0000 C148 C \n",
+ "891 0 0 370376 7.7500 NaN Q \n",
+ "\n",
+ "[891 rows x 11 columns]"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "titanic = pd.read_csv(\"data/titanic.csv\", index_col=\"PassengerId\")\n",
+ "\n",
+ "titanic"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Унитарное кодирование признаков Пол (Sex) и Порт посадки (Embarked)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Кодирование"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Embarked_Q | \n",
+ " Embarked_S | \n",
+ " Embarked_nan | \n",
+ " Sex_male | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 886 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 887 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 888 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 889 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 890 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
891 rows × 4 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Embarked_Q Embarked_S Embarked_nan Sex_male\n",
+ "0 0.0 1.0 0.0 1.0\n",
+ "1 0.0 0.0 0.0 0.0\n",
+ "2 0.0 1.0 0.0 0.0\n",
+ "3 0.0 1.0 0.0 0.0\n",
+ "4 0.0 1.0 0.0 1.0\n",
+ ".. ... ... ... ...\n",
+ "886 0.0 1.0 0.0 1.0\n",
+ "887 0.0 1.0 0.0 0.0\n",
+ "888 0.0 1.0 0.0 0.0\n",
+ "889 0.0 0.0 0.0 1.0\n",
+ "890 1.0 0.0 0.0 1.0\n",
+ "\n",
+ "[891 rows x 4 columns]"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.preprocessing import OneHotEncoder\n",
+ "import numpy as np\n",
+ "# One-hot encode Embarked and Sex; drop=\"first\" omits one reference category per feature\n",
+ "encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
+ "\n",
+ "encoded_values = encoder.fit_transform(titanic[[\"Embarked\", \"Sex\"]])\n",
+ "\n",
+ "encoded_columns = encoder.get_feature_names_out([\"Embarked\", \"Sex\"])\n",
+ "# Reuse titanic's PassengerId index: a later pd.concat(axis=1) aligns rows by index label, not by position\n",
+ "encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns, index=titanic.index)\n",
+ "\n",
+ "encoded_values_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Добавление признаков в исходный Dataframe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Survived | \n",
+ " Pclass | \n",
+ " Name | \n",
+ " Sex | \n",
+ " Age | \n",
+ " SibSp | \n",
+ " Parch | \n",
+ " Ticket | \n",
+ " Fare | \n",
+ " Cabin | \n",
+ " Embarked | \n",
+ " Embarked_Q | \n",
+ " Embarked_S | \n",
+ " Embarked_nan | \n",
+ " Sex_male | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " Braund, Mr. Owen Harris | \n",
+ " male | \n",
+ " 22.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " A/5 21171 | \n",
+ " 7.2500 | \n",
+ " NaN | \n",
+ " S | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " Cumings, Mrs. John Bradley (Florence Briggs Th... | \n",
+ " female | \n",
+ " 38.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " PC 17599 | \n",
+ " 71.2833 | \n",
+ " C85 | \n",
+ " C | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " Heikkinen, Miss. Laina | \n",
+ " female | \n",
+ " 26.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " STON/O2. 3101282 | \n",
+ " 7.9250 | \n",
+ " NaN | \n",
+ " S | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n",
+ " female | \n",
+ " 35.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 113803 | \n",
+ " 53.1000 | \n",
+ " C123 | \n",
+ " S | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " Allen, Mr. William Henry | \n",
+ " male | \n",
+ " 35.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 373450 | \n",
+ " 8.0500 | \n",
+ " NaN | \n",
+ " S | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 888 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " Graham, Miss. Margaret Edith | \n",
+ " female | \n",
+ " 19.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 112053 | \n",
+ " 30.0000 | \n",
+ " B42 | \n",
+ " S | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 889 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " Johnston, Miss. Catherine Helen \"Carrie\" | \n",
+ " female | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " W./C. 6607 | \n",
+ " 23.4500 | \n",
+ " NaN | \n",
+ " S | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 890 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " Behr, Mr. Karl Howell | \n",
+ " male | \n",
+ " 26.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 111369 | \n",
+ " 30.0000 | \n",
+ " C148 | \n",
+ " C | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 891 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " Dooley, Mr. Patrick | \n",
+ " male | \n",
+ " 32.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 370376 | \n",
+ " 7.7500 | \n",
+ " NaN | \n",
+ " Q | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
892 rows × 15 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Survived Pclass Name \\\n",
+ "1 0.0 3.0 Braund, Mr. Owen Harris \n",
+ "2 1.0 1.0 Cumings, Mrs. John Bradley (Florence Briggs Th... \n",
+ "3 1.0 3.0 Heikkinen, Miss. Laina \n",
+ "4 1.0 1.0 Futrelle, Mrs. Jacques Heath (Lily May Peel) \n",
+ "5 0.0 3.0 Allen, Mr. William Henry \n",
+ ".. ... ... ... \n",
+ "888 1.0 1.0 Graham, Miss. Margaret Edith \n",
+ "889 0.0 3.0 Johnston, Miss. Catherine Helen \"Carrie\" \n",
+ "890 1.0 1.0 Behr, Mr. Karl Howell \n",
+ "891 0.0 3.0 Dooley, Mr. Patrick \n",
+ "0 NaN NaN NaN \n",
+ "\n",
+ " Sex Age SibSp Parch Ticket Fare Cabin Embarked \\\n",
+ "1 male 22.0 1.0 0.0 A/5 21171 7.2500 NaN S \n",
+ "2 female 38.0 1.0 0.0 PC 17599 71.2833 C85 C \n",
+ "3 female 26.0 0.0 0.0 STON/O2. 3101282 7.9250 NaN S \n",
+ "4 female 35.0 1.0 0.0 113803 53.1000 C123 S \n",
+ "5 male 35.0 0.0 0.0 373450 8.0500 NaN S \n",
+ ".. ... ... ... ... ... ... ... ... \n",
+ "888 female 19.0 0.0 0.0 112053 30.0000 B42 S \n",
+ "889 female NaN 1.0 2.0 W./C. 6607 23.4500 NaN S \n",
+ "890 male 26.0 0.0 0.0 111369 30.0000 C148 C \n",
+ "891 male 32.0 0.0 0.0 370376 7.7500 NaN Q \n",
+ "0 NaN NaN NaN NaN NaN NaN NaN NaN \n",
+ "\n",
+ " Embarked_Q Embarked_S Embarked_nan Sex_male \n",
+ "1 0.0 0.0 0.0 0.0 \n",
+ "2 0.0 1.0 0.0 0.0 \n",
+ "3 0.0 1.0 0.0 0.0 \n",
+ "4 0.0 1.0 0.0 1.0 \n",
+ "5 1.0 0.0 0.0 1.0 \n",
+ ".. ... ... ... ... \n",
+ "888 0.0 1.0 0.0 0.0 \n",
+ "889 0.0 0.0 0.0 1.0 \n",
+ "890 1.0 0.0 0.0 1.0 \n",
+ "891 NaN NaN NaN NaN \n",
+ "0 0.0 1.0 0.0 1.0 \n",
+ "\n",
+ "[892 rows x 15 columns]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "titanic = pd.concat([titanic, encoded_values_df], axis=1)\n",
+ "\n",
+ "titanic"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Дискретизация признаков"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Равномерное разделение данных на 3 группы"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "labels = [\"young\", \"middle-aged\", \"old\"]\n",
+ "num_bins = 3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(array([ 0.42 , 26.94666667, 53.47333333, 80. ]),\n",
+ " array([319, 523, 50]))"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "hist1, bins1 = np.histogram(titanic[\"Age\"].fillna(titanic[\"Age\"].median()), bins=num_bins)\n",
+ "bins1, hist1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Age | \n",
+ " Age | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 22.0 | \n",
+ " (0.42, 26.947] | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 38.0 | \n",
+ " (26.947, 53.473] | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 26.0 | \n",
+ " (0.42, 26.947] | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 35.0 | \n",
+ " (26.947, 53.473] | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 35.0 | \n",
+ " (26.947, 53.473] | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 54.0 | \n",
+ " (53.473, 80.0] | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 2.0 | \n",
+ " (0.42, 26.947] | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 27.0 | \n",
+ " (26.947, 53.473] | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 14.0 | \n",
+ " (0.42, 26.947] | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 4.0 | \n",
+ " (0.42, 26.947] | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 58.0 | \n",
+ " (53.473, 80.0] | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 20.0 | \n",
+ " (0.42, 26.947] | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 39.0 | \n",
+ " (26.947, 53.473] | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 14.0 | \n",
+ " (0.42, 26.947] | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 55.0 | \n",
+ " (53.473, 80.0] | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 2.0 | \n",
+ " (0.42, 26.947] | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 31.0 | \n",
+ " (26.947, 53.473] | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Age Age\n",
+ "1 22.0 (0.42, 26.947]\n",
+ "2 38.0 (26.947, 53.473]\n",
+ "3 26.0 (0.42, 26.947]\n",
+ "4 35.0 (26.947, 53.473]\n",
+ "5 35.0 (26.947, 53.473]\n",
+ "6 NaN NaN\n",
+ "7 54.0 (53.473, 80.0]\n",
+ "8 2.0 (0.42, 26.947]\n",
+ "9 27.0 (26.947, 53.473]\n",
+ "10 14.0 (0.42, 26.947]\n",
+ "11 4.0 (0.42, 26.947]\n",
+ "12 58.0 (53.473, 80.0]\n",
+ "13 20.0 (0.42, 26.947]\n",
+ "14 39.0 (26.947, 53.473]\n",
+ "15 14.0 (0.42, 26.947]\n",
+ "16 55.0 (53.473, 80.0]\n",
+ "17 2.0 (0.42, 26.947]\n",
+ "18 NaN NaN\n",
+ "19 31.0 (26.947, 53.473]\n",
+ "20 NaN NaN"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.concat([titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins1), include_lowest=True)], axis=1).head(20)  # include_lowest: bins1[0] is the minimum age, which a left-open first bin would turn into NaN"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Age | \n",
+ " Age | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 22.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 38.0 | \n",
+ " middle-aged | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 26.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 35.0 | \n",
+ " middle-aged | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 35.0 | \n",
+ " middle-aged | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 54.0 | \n",
+ " old | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 2.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 27.0 | \n",
+ " middle-aged | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 14.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 4.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 58.0 | \n",
+ " old | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 20.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 39.0 | \n",
+ " middle-aged | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 14.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 55.0 | \n",
+ " old | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 2.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 31.0 | \n",
+ " middle-aged | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Age Age\n",
+ "1 22.0 young\n",
+ "2 38.0 middle-aged\n",
+ "3 26.0 young\n",
+ "4 35.0 middle-aged\n",
+ "5 35.0 middle-aged\n",
+ "6 NaN NaN\n",
+ "7 54.0 old\n",
+ "8 2.0 young\n",
+ "9 27.0 middle-aged\n",
+ "10 14.0 young\n",
+ "11 4.0 young\n",
+ "12 58.0 old\n",
+ "13 20.0 young\n",
+ "14 39.0 middle-aged\n",
+ "15 14.0 young\n",
+ "16 55.0 old\n",
+ "17 2.0 young\n",
+ "18 NaN NaN\n",
+ "19 31.0 middle-aged\n",
+ "20 NaN NaN"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.concat([titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins1), labels=labels, include_lowest=True)], axis=1).head(20)  # include_lowest: bins1[0] is the minimum age, which a left-open first bin would turn into NaN"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Равномерное разделение данных на 3 группы с установкой собственной границы диапазона значений (от 0 до 100)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(array([ 0. , 33.33333333, 66.66666667, 100. ]),\n",
+ " array([641, 244, 7]))"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "bins2 = np.linspace(0, 100, 4)\n",
+ "tmp_bins2 = np.digitize(titanic[\"Age\"].fillna(titanic[\"Age\"].median()), bins2)\n",
+ "hist2 = np.bincount(tmp_bins2 - 1)\n",
+ "bins2, hist2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Age | \n",
+ " Age | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 22.0 | \n",
+ " (0.0, 33.333] | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 38.0 | \n",
+ " (33.333, 66.667] | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 26.0 | \n",
+ " (0.0, 33.333] | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 35.0 | \n",
+ " (33.333, 66.667] | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 35.0 | \n",
+ " (33.333, 66.667] | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 54.0 | \n",
+ " (33.333, 66.667] | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 2.0 | \n",
+ " (0.0, 33.333] | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 27.0 | \n",
+ " (0.0, 33.333] | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 14.0 | \n",
+ " (0.0, 33.333] | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 4.0 | \n",
+ " (0.0, 33.333] | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 58.0 | \n",
+ " (33.333, 66.667] | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 20.0 | \n",
+ " (0.0, 33.333] | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 39.0 | \n",
+ " (33.333, 66.667] | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 14.0 | \n",
+ " (0.0, 33.333] | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 55.0 | \n",
+ " (33.333, 66.667] | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 2.0 | \n",
+ " (0.0, 33.333] | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 31.0 | \n",
+ " (0.0, 33.333] | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Age Age\n",
+ "1 22.0 (0.0, 33.333]\n",
+ "2 38.0 (33.333, 66.667]\n",
+ "3 26.0 (0.0, 33.333]\n",
+ "4 35.0 (33.333, 66.667]\n",
+ "5 35.0 (33.333, 66.667]\n",
+ "6 NaN NaN\n",
+ "7 54.0 (33.333, 66.667]\n",
+ "8 2.0 (0.0, 33.333]\n",
+ "9 27.0 (0.0, 33.333]\n",
+ "10 14.0 (0.0, 33.333]\n",
+ "11 4.0 (0.0, 33.333]\n",
+ "12 58.0 (33.333, 66.667]\n",
+ "13 20.0 (0.0, 33.333]\n",
+ "14 39.0 (33.333, 66.667]\n",
+ "15 14.0 (0.0, 33.333]\n",
+ "16 55.0 (33.333, 66.667]\n",
+ "17 2.0 (0.0, 33.333]\n",
+ "18 NaN NaN\n",
+ "19 31.0 (0.0, 33.333]\n",
+ "20 NaN NaN"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.concat([titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins2))], axis=1).head(20)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Age | \n",
+ " Age | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 22.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 38.0 | \n",
+ " middle-aged | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 26.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 35.0 | \n",
+ " middle-aged | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 35.0 | \n",
+ " middle-aged | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 54.0 | \n",
+ " middle-aged | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 2.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 27.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 14.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 4.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 58.0 | \n",
+ " middle-aged | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 20.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 39.0 | \n",
+ " middle-aged | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 14.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 55.0 | \n",
+ " middle-aged | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 2.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 31.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Age Age\n",
+ "1 22.0 young\n",
+ "2 38.0 middle-aged\n",
+ "3 26.0 young\n",
+ "4 35.0 middle-aged\n",
+ "5 35.0 middle-aged\n",
+ "6 NaN NaN\n",
+ "7 54.0 middle-aged\n",
+ "8 2.0 young\n",
+ "9 27.0 young\n",
+ "10 14.0 young\n",
+ "11 4.0 young\n",
+ "12 58.0 middle-aged\n",
+ "13 20.0 young\n",
+ "14 39.0 middle-aged\n",
+ "15 14.0 young\n",
+ "16 55.0 middle-aged\n",
+ "17 2.0 young\n",
+ "18 NaN NaN\n",
+ "19 31.0 young\n",
+ "20 NaN NaN"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.concat([titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins2), labels=labels)], axis=1).head(20)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Разделение данных на 3 группы с установкой собственных интервалов: (0, 40], (40, 60], (60, 100]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(array([ 0, 40, 60, 100]), array([729, 137, 26]))"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "hist3, bins3 = np.histogram(\n",
+ " titanic[\"Age\"].fillna(titanic[\"Age\"].median()), bins=[0, 40, 60, 100]\n",
+ ")\n",
+ "bins3, hist3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Age | \n",
+ " Age | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 22.0 | \n",
+ " (0.0, 40.0] | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 38.0 | \n",
+ " (0.0, 40.0] | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 26.0 | \n",
+ " (0.0, 40.0] | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 35.0 | \n",
+ " (0.0, 40.0] | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 35.0 | \n",
+ " (0.0, 40.0] | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 54.0 | \n",
+ " (40.0, 60.0] | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 2.0 | \n",
+ " (0.0, 40.0] | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 27.0 | \n",
+ " (0.0, 40.0] | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 14.0 | \n",
+ " (0.0, 40.0] | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 4.0 | \n",
+ " (0.0, 40.0] | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 58.0 | \n",
+ " (40.0, 60.0] | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 20.0 | \n",
+ " (0.0, 40.0] | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 39.0 | \n",
+ " (0.0, 40.0] | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 14.0 | \n",
+ " (0.0, 40.0] | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 55.0 | \n",
+ " (40.0, 60.0] | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 2.0 | \n",
+ " (0.0, 40.0] | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 31.0 | \n",
+ " (0.0, 40.0] | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Age Age\n",
+ "1 22.0 (0.0, 40.0]\n",
+ "2 38.0 (0.0, 40.0]\n",
+ "3 26.0 (0.0, 40.0]\n",
+ "4 35.0 (0.0, 40.0]\n",
+ "5 35.0 (0.0, 40.0]\n",
+ "6 NaN NaN\n",
+ "7 54.0 (40.0, 60.0]\n",
+ "8 2.0 (0.0, 40.0]\n",
+ "9 27.0 (0.0, 40.0]\n",
+ "10 14.0 (0.0, 40.0]\n",
+ "11 4.0 (0.0, 40.0]\n",
+ "12 58.0 (40.0, 60.0]\n",
+ "13 20.0 (0.0, 40.0]\n",
+ "14 39.0 (0.0, 40.0]\n",
+ "15 14.0 (0.0, 40.0]\n",
+ "16 55.0 (40.0, 60.0]\n",
+ "17 2.0 (0.0, 40.0]\n",
+ "18 NaN NaN\n",
+ "19 31.0 (0.0, 40.0]\n",
+ "20 NaN NaN"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.concat([titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins3))], axis=1).head(20)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Age | \n",
+ " Age | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 22.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 38.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 26.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 35.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 35.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 54.0 | \n",
+ " middle-aged | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 2.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 27.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 14.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 4.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 58.0 | \n",
+ " middle-aged | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 20.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 39.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 14.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 55.0 | \n",
+ " middle-aged | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 2.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 31.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Age Age\n",
+ "1 22.0 young\n",
+ "2 38.0 young\n",
+ "3 26.0 young\n",
+ "4 35.0 young\n",
+ "5 35.0 young\n",
+ "6 NaN NaN\n",
+ "7 54.0 middle-aged\n",
+ "8 2.0 young\n",
+ "9 27.0 young\n",
+ "10 14.0 young\n",
+ "11 4.0 young\n",
+ "12 58.0 middle-aged\n",
+ "13 20.0 young\n",
+ "14 39.0 young\n",
+ "15 14.0 young\n",
+ "16 55.0 middle-aged\n",
+ "17 2.0 young\n",
+ "18 NaN NaN\n",
+ "19 31.0 young\n",
+ "20 NaN NaN"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.concat([titanic[\"Age\"], pd.cut(titanic[\"Age\"], list(bins3), labels=labels)], axis=1).head(20)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Квантильное разделение данных на 3 группы"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Age | \n",
+ " Age | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 22.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 38.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 26.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 35.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 35.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 54.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 27.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 14.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 58.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 20.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 39.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 14.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 55.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 31.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Age Age\n",
+ "1 22.0 0.0\n",
+ "2 38.0 2.0\n",
+ "3 26.0 1.0\n",
+ "4 35.0 2.0\n",
+ "5 35.0 2.0\n",
+ "6 NaN NaN\n",
+ "7 54.0 2.0\n",
+ "8 2.0 0.0\n",
+ "9 27.0 1.0\n",
+ "10 14.0 0.0\n",
+ "11 4.0 0.0\n",
+ "12 58.0 2.0\n",
+ "13 20.0 0.0\n",
+ "14 39.0 2.0\n",
+ "15 14.0 0.0\n",
+ "16 55.0 2.0\n",
+ "17 2.0 0.0\n",
+ "18 NaN NaN\n",
+ "19 31.0 1.0\n",
+ "20 NaN NaN"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.concat([titanic[\"Age\"], pd.qcut(titanic[\"Age\"], q=3, labels=False)], axis=1).head(20)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Age | \n",
+ " Age | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 22.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 38.0 | \n",
+ " old | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 26.0 | \n",
+ " middle-aged | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 35.0 | \n",
+ " old | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 35.0 | \n",
+ " old | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 54.0 | \n",
+ " old | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 2.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 27.0 | \n",
+ " middle-aged | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 14.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 4.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 58.0 | \n",
+ " old | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 20.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 39.0 | \n",
+ " old | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 14.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 55.0 | \n",
+ " old | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 2.0 | \n",
+ " young | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 31.0 | \n",
+ " middle-aged | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Age Age\n",
+ "1 22.0 young\n",
+ "2 38.0 old\n",
+ "3 26.0 middle-aged\n",
+ "4 35.0 old\n",
+ "5 35.0 old\n",
+ "6 NaN NaN\n",
+ "7 54.0 old\n",
+ "8 2.0 young\n",
+ "9 27.0 middle-aged\n",
+ "10 14.0 young\n",
+ "11 4.0 young\n",
+ "12 58.0 old\n",
+ "13 20.0 young\n",
+ "14 39.0 old\n",
+ "15 14.0 young\n",
+ "16 55.0 old\n",
+ "17 2.0 young\n",
+ "18 NaN NaN\n",
+ "19 31.0 middle-aged\n",
+ "20 NaN NaN"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.concat([titanic[\"Age\"], pd.qcut(titanic[\"Age\"], q=3, labels=labels)], axis=1).head(20)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Пример использования библиотеки Featuretools для автоматического конструирования (синтеза) признаков\n",
"\n",
"https://featuretools.alteryx.com/en/stable/getting_started/using_entitysets.html"
]
@@ -24,11 +2244,10 @@
},
{
"cell_type": "code",
- "execution_count": 51,
+ "execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
- "import pandas as pd\n",
"import featuretools as ft\n",
"from woodwork.logical_types import Categorical, Datetime\n",
"\n",
@@ -57,7 +2276,7 @@
},
{
"cell_type": "code",
- "execution_count": 52,
+ "execution_count": 17,
"metadata": {},
"outputs": [
{
@@ -112,7 +2331,7 @@
" No relationships"
]
},
- "execution_count": 52,
+ "execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -189,7 +2408,7 @@
},
{
"cell_type": "code",
- "execution_count": 53,
+ "execution_count": 18,
"metadata": {},
"outputs": [
{
@@ -209,7 +2428,7 @@
" order_items.seller_id -> sellers.seller_id"
]
},
- "execution_count": 53,
+ "execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
@@ -229,14 +2448,14 @@
"source": [
"#### Автоматическое конструирование признаков с помощью featuretools\n",
"\n",
- "Библиотека применят различные функции агрегации к атрибутам таблицы order_items с учетом отношений\n",
+ "Библиотека применяет различные функции агрегации и трансформации к атрибутам таблицы order_items с учетом отношений\n",
"\n",
"Результат помещается в Dataframe feature_matrix"
]
},
{
"cell_type": "code",
- "execution_count": 54,
+ "execution_count": 19,
"metadata": {},
"outputs": [
{
@@ -247,11 +2466,11 @@
" agg_primitives: ['any', 'mode']\n",
"This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.\n",
" warnings.warn(warning_msg, UnusedPrimitiveWarning)\n",
- "c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
+ "c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
" ).agg(to_agg)\n",
- "c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
+ "c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
" ).agg(to_agg)\n",
- "c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
+ "c:\\Users\\user\\Projects\\python\\mai\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
" ).agg(to_agg)\n"
]
},
@@ -765,7 +2984,7 @@
"[115 rows x 43 columns]"
]
},
- "execution_count": 54,
+ "execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
@@ -793,7 +3012,7 @@
},
{
"cell_type": "code",
- "execution_count": 55,
+ "execution_count": 20,
"metadata": {},
"outputs": [
{
@@ -844,7 +3063,7 @@
" ]"
]
},
- "execution_count": 55,
+ "execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
@@ -852,6 +3071,864 @@
"source": [
"feature_defs"
]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Отсечение значений признаков"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Определение выбросов с помощью boxplot"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 148,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 148,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "titanic.boxplot(column=\"Age\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Отсечение значений признака Возраст, превышающих 65 лет (такие значения заменяются на 65)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 149,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Name | \n",
+ " Age | \n",
+ " AgeClip | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 34 | \n",
+ " Wheadon, Mr. Edward H | \n",
+ " 66.0 | \n",
+ " 65.0 | \n",
+ "
\n",
+ " \n",
+ " 97 | \n",
+ " Goldschmidt, Mr. George B | \n",
+ " 71.0 | \n",
+ " 65.0 | \n",
+ "
\n",
+ " \n",
+ " 117 | \n",
+ " Connors, Mr. Patrick | \n",
+ " 70.5 | \n",
+ " 65.0 | \n",
+ "
\n",
+ " \n",
+ " 494 | \n",
+ " Artagaveytia, Mr. Ramon | \n",
+ " 71.0 | \n",
+ " 65.0 | \n",
+ "
\n",
+ " \n",
+ " 631 | \n",
+ " Barkworth, Mr. Algernon Henry Wilson | \n",
+ " 80.0 | \n",
+ " 65.0 | \n",
+ "
\n",
+ " \n",
+ " 673 | \n",
+ " Mitchell, Mr. Henry Michael | \n",
+ " 70.0 | \n",
+ " 65.0 | \n",
+ "
\n",
+ " \n",
+ " 746 | \n",
+ " Crosby, Capt. Edward Gifford | \n",
+ " 70.0 | \n",
+ " 65.0 | \n",
+ "
\n",
+ " \n",
+ " 852 | \n",
+ " Svensson, Mr. Johan | \n",
+ " 74.0 | \n",
+ " 65.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Name Age AgeClip\n",
+ "34 Wheadon, Mr. Edward H 66.0 65.0\n",
+ "97 Goldschmidt, Mr. George B 71.0 65.0\n",
+ "117 Connors, Mr. Patrick 70.5 65.0\n",
+ "494 Artagaveytia, Mr. Ramon 71.0 65.0\n",
+ "631 Barkworth, Mr. Algernon Henry Wilson 80.0 65.0\n",
+ "673 Mitchell, Mr. Henry Michael 70.0 65.0\n",
+ "746 Crosby, Capt. Edward Gifford 70.0 65.0\n",
+ "852 Svensson, Mr. Johan 74.0 65.0"
+ ]
+ },
+ "execution_count": 149,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "titanic_norm = titanic.copy()\n",
+ "\n",
+ "# Cap Age to the range [0, 65]; missing (NaN) ages are left as NaN\n",
+ "titanic_norm[\"AgeClip\"] = titanic_norm[\"Age\"].clip(lower=0, upper=65)\n",
+ "titanic_norm.loc[titanic_norm[\"Age\"] > 65, [\"Name\", \"Age\", \"AgeClip\"]]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Винсоризация признака Возраст"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 150,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "56.0\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Name | \n",
+ " Age | \n",
+ " AgeWinsorize | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 34 | \n",
+ " Wheadon, Mr. Edward H | \n",
+ " 66.0 | \n",
+ " 54.0 | \n",
+ "
\n",
+ " \n",
+ " 97 | \n",
+ " Goldschmidt, Mr. George B | \n",
+ " 71.0 | \n",
+ " 54.0 | \n",
+ "
\n",
+ " \n",
+ " 117 | \n",
+ " Connors, Mr. Patrick | \n",
+ " 70.5 | \n",
+ " 54.0 | \n",
+ "
\n",
+ " \n",
+ " 494 | \n",
+ " Artagaveytia, Mr. Ramon | \n",
+ " 71.0 | \n",
+ " 54.0 | \n",
+ "
\n",
+ " \n",
+ " 631 | \n",
+ " Barkworth, Mr. Algernon Henry Wilson | \n",
+ " 80.0 | \n",
+ " 54.0 | \n",
+ "
\n",
+ " \n",
+ " 673 | \n",
+ " Mitchell, Mr. Henry Michael | \n",
+ " 70.0 | \n",
+ " 54.0 | \n",
+ "
\n",
+ " \n",
+ " 746 | \n",
+ " Crosby, Capt. Edward Gifford | \n",
+ " 70.0 | \n",
+ " 54.0 | \n",
+ "
\n",
+ " \n",
+ " 852 | \n",
+ " Svensson, Mr. Johan | \n",
+ " 74.0 | \n",
+ " 54.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Name Age AgeWinsorize\n",
+ "34 Wheadon, Mr. Edward H 66.0 54.0\n",
+ "97 Goldschmidt, Mr. George B 71.0 54.0\n",
+ "117 Connors, Mr. Patrick 70.5 54.0\n",
+ "494 Artagaveytia, Mr. Ramon 71.0 54.0\n",
+ "631 Barkworth, Mr. Algernon Henry Wilson 80.0 54.0\n",
+ "673 Mitchell, Mr. Henry Michael 70.0 54.0\n",
+ "746 Crosby, Capt. Edward Gifford 70.0 54.0\n",
+ "852 Svensson, Mr. Johan 74.0 54.0"
+ ]
+ },
+ "execution_count": 150,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from scipy.stats.mstats import winsorize\n",
+ "\n",
+ "# 95th percentile of the raw Age column, printed for reference only: the cap applied\n",
+ "# below is computed on the mean-filled data, so it can differ from this value\n",
+ "print(titanic_norm[\"Age\"].quantile(q=0.95))\n",
+ "titanic_norm[\"AgeWinsorize\"] = winsorize(\n",
+ "    titanic_norm[\"Age\"].fillna(titanic_norm[\"Age\"].mean()), limits=(0, 0.05), inplace=False\n",
+ ")\n",
+ "titanic_norm.loc[titanic_norm[\"Age\"] > 65, [\"Name\", \"Age\", \"AgeWinsorize\"]]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Нормализация значений"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 153,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Name | \n",
+ " Age | \n",
+ " AgeNorm | \n",
+ " AgeClipNorm | \n",
+ " AgeWinsorizeNorm | \n",
+ " AgeWinsorizeNorm2 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " Braund, Mr. Owen Harris | \n",
+ " 22.0 | \n",
+ " 0.271174 | \n",
+ " 0.334159 | \n",
+ " 0.402762 | \n",
+ " -0.194476 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Cumings, Mrs. John Bradley (Florence Briggs Th... | \n",
+ " 38.0 | \n",
+ " 0.472229 | \n",
+ " 0.581914 | \n",
+ " 0.701381 | \n",
+ " 0.402762 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Heikkinen, Miss. Laina | \n",
+ " 26.0 | \n",
+ " 0.321438 | \n",
+ " 0.396098 | \n",
+ " 0.477417 | \n",
+ " -0.045166 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n",
+ " 35.0 | \n",
+ " 0.434531 | \n",
+ " 0.535460 | \n",
+ " 0.645390 | \n",
+ " 0.290780 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Allen, Mr. William Henry | \n",
+ " 35.0 | \n",
+ " 0.434531 | \n",
+ " 0.535460 | \n",
+ " 0.645390 | \n",
+ " 0.290780 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " Moran, Mr. James | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.546456 | \n",
+ " 0.092912 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " McCarthy, Mr. Timothy J | \n",
+ " 54.0 | \n",
+ " 0.673285 | \n",
+ " 0.829669 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " Palsson, Master. Gosta Leonard | \n",
+ " 2.0 | \n",
+ " 0.019854 | \n",
+ " 0.024466 | \n",
+ " 0.029489 | \n",
+ " -0.941023 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | \n",
+ " 27.0 | \n",
+ " 0.334004 | \n",
+ " 0.411583 | \n",
+ " 0.496081 | \n",
+ " -0.007839 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Nasser, Mrs. Nicholas (Adele Achem) | \n",
+ " 14.0 | \n",
+ " 0.170646 | \n",
+ " 0.210282 | \n",
+ " 0.253453 | \n",
+ " -0.493094 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Sandstrom, Miss. Marguerite Rut | \n",
+ " 4.0 | \n",
+ " 0.044986 | \n",
+ " 0.055435 | \n",
+ " 0.066816 | \n",
+ " -0.866368 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Bonnell, Miss. Elizabeth | \n",
+ " 58.0 | \n",
+ " 0.723549 | \n",
+ " 0.891607 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " Saundercock, Mr. William Henry | \n",
+ " 20.0 | \n",
+ " 0.246042 | \n",
+ " 0.303190 | \n",
+ " 0.365435 | \n",
+ " -0.269130 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " Andersson, Mr. Anders Johan | \n",
+ " 39.0 | \n",
+ " 0.484795 | \n",
+ " 0.597399 | \n",
+ " 0.720045 | \n",
+ " 0.440090 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " Vestrom, Miss. Hulda Amanda Adolfina | \n",
+ " 14.0 | \n",
+ " 0.170646 | \n",
+ " 0.210282 | \n",
+ " 0.253453 | \n",
+ " -0.493094 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " Hewlett, Mrs. (Mary D Kingcome) | \n",
+ " 55.0 | \n",
+ " 0.685851 | \n",
+ " 0.845153 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " Rice, Master. Eugene | \n",
+ " 2.0 | \n",
+ " 0.019854 | \n",
+ " 0.024466 | \n",
+ " 0.029489 | \n",
+ " -0.941023 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " Williams, Mr. Charles Eugene | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.546456 | \n",
+ " 0.092912 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Vander Planke, Mrs. Julius (Emelia Maria Vande... | \n",
+ " 31.0 | \n",
+ " 0.384267 | \n",
+ " 0.473521 | \n",
+ " 0.570735 | \n",
+ " 0.141471 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " Masselmani, Mrs. Fatima | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.546456 | \n",
+ " 0.092912 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Name Age AgeNorm \\\n",
+ "1 Braund, Mr. Owen Harris 22.0 0.271174 \n",
+ "2 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 0.472229 \n",
+ "3 Heikkinen, Miss. Laina 26.0 0.321438 \n",
+ "4 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 0.434531 \n",
+ "5 Allen, Mr. William Henry 35.0 0.434531 \n",
+ "6 Moran, Mr. James NaN NaN \n",
+ "7 McCarthy, Mr. Timothy J 54.0 0.673285 \n",
+ "8 Palsson, Master. Gosta Leonard 2.0 0.019854 \n",
+ "9 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 27.0 0.334004 \n",
+ "10 Nasser, Mrs. Nicholas (Adele Achem) 14.0 0.170646 \n",
+ "11 Sandstrom, Miss. Marguerite Rut 4.0 0.044986 \n",
+ "12 Bonnell, Miss. Elizabeth 58.0 0.723549 \n",
+ "13 Saundercock, Mr. William Henry 20.0 0.246042 \n",
+ "14 Andersson, Mr. Anders Johan 39.0 0.484795 \n",
+ "15 Vestrom, Miss. Hulda Amanda Adolfina 14.0 0.170646 \n",
+ "16 Hewlett, Mrs. (Mary D Kingcome) 55.0 0.685851 \n",
+ "17 Rice, Master. Eugene 2.0 0.019854 \n",
+ "18 Williams, Mr. Charles Eugene NaN NaN \n",
+ "19 Vander Planke, Mrs. Julius (Emelia Maria Vande... 31.0 0.384267 \n",
+ "20 Masselmani, Mrs. Fatima NaN NaN \n",
+ "\n",
+ " AgeClipNorm AgeWinsorizeNorm AgeWinsorizeNorm2 \n",
+ "1 0.334159 0.402762 -0.194476 \n",
+ "2 0.581914 0.701381 0.402762 \n",
+ "3 0.396098 0.477417 -0.045166 \n",
+ "4 0.535460 0.645390 0.290780 \n",
+ "5 0.535460 0.645390 0.290780 \n",
+ "6 NaN 0.546456 0.092912 \n",
+ "7 0.829669 1.000000 1.000000 \n",
+ "8 0.024466 0.029489 -0.941023 \n",
+ "9 0.411583 0.496081 -0.007839 \n",
+ "10 0.210282 0.253453 -0.493094 \n",
+ "11 0.055435 0.066816 -0.866368 \n",
+ "12 0.891607 1.000000 1.000000 \n",
+ "13 0.303190 0.365435 -0.269130 \n",
+ "14 0.597399 0.720045 0.440090 \n",
+ "15 0.210282 0.253453 -0.493094 \n",
+ "16 0.845153 1.000000 1.000000 \n",
+ "17 0.024466 0.029489 -0.941023 \n",
+ "18 NaN 0.546456 0.092912 \n",
+ "19 0.473521 0.570735 0.141471 \n",
+ "20 NaN 0.546456 0.092912 "
+ ]
+ },
+ "execution_count": 153,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn import preprocessing\n",
+ "\n",
+ "min_max_scaler = preprocessing.MinMaxScaler()\n",
+ "min_max_scaler_2 = preprocessing.MinMaxScaler(feature_range=(-1, 1))\n",
+ "\n",
+ "\n",
+ "def scale_column(frame, column, scaler):\n",
+ "    \"\"\"Fit `scaler` on one column of `frame` and return the scaled values as a 1-D array.\n",
+ "\n",
+ "    fit_transform refits the scaler on every call, so a single scaler object can be\n",
+ "    reused for several columns. NaN inputs stay NaN in the result.\n",
+ "    \"\"\"\n",
+ "    # scikit-learn expects a 2-D (n_samples, 1) input; ravel() flattens the output again\n",
+ "    return scaler.fit_transform(frame[[column]].to_numpy()).ravel()\n",
+ "\n",
+ "\n",
+ "# [0, 1] scaling of the raw, clipped and winsorized ages; [-1, 1] for the last column\n",
+ "titanic_norm[\"AgeNorm\"] = scale_column(titanic_norm, \"Age\", min_max_scaler)\n",
+ "titanic_norm[\"AgeClipNorm\"] = scale_column(titanic_norm, \"AgeClip\", min_max_scaler)\n",
+ "titanic_norm[\"AgeWinsorizeNorm\"] = scale_column(titanic_norm, \"AgeWinsorize\", min_max_scaler)\n",
+ "titanic_norm[\"AgeWinsorizeNorm2\"] = scale_column(titanic_norm, \"AgeWinsorize\", min_max_scaler_2)\n",
+ "\n",
+ "titanic_norm[\n",
+ "    [\"Name\", \"Age\", \"AgeNorm\", \"AgeClipNorm\", \"AgeWinsorizeNorm\", \"AgeWinsorizeNorm2\"]\n",
+ "].head(20)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Стандартизация значений"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 152,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Name | \n",
+ " Age | \n",
+ " AgeStand | \n",
+ " AgeClipStand | \n",
+ " AgeWinsorizeStand | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " Braund, Mr. Owen Harris | \n",
+ " 22.0 | \n",
+ " -0.530377 | \n",
+ " -0.532745 | \n",
+ " -0.606602 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Cumings, Mrs. John Bradley (Florence Briggs Th... | \n",
+ " 38.0 | \n",
+ " 0.571831 | \n",
+ " 0.585060 | \n",
+ " 0.718863 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Heikkinen, Miss. Laina | \n",
+ " 26.0 | \n",
+ " -0.254825 | \n",
+ " -0.253294 | \n",
+ " -0.275236 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n",
+ " 35.0 | \n",
+ " 0.365167 | \n",
+ " 0.375472 | \n",
+ " 0.470339 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Allen, Mr. William Henry | \n",
+ " 35.0 | \n",
+ " 0.365167 | \n",
+ " 0.375472 | \n",
+ " 0.470339 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " Moran, Mr. James | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.031205 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " McCarthy, Mr. Timothy J | \n",
+ " 54.0 | \n",
+ " 1.674039 | \n",
+ " 1.702866 | \n",
+ " 2.044329 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " Palsson, Master. Gosta Leonard | \n",
+ " 2.0 | \n",
+ " -1.908136 | \n",
+ " -1.930003 | \n",
+ " -2.263435 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | \n",
+ " 27.0 | \n",
+ " -0.185937 | \n",
+ " -0.183431 | \n",
+ " -0.192394 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Nasser, Mrs. Nicholas (Adele Achem) | \n",
+ " 14.0 | \n",
+ " -1.081480 | \n",
+ " -1.091648 | \n",
+ " -1.269335 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Sandstrom, Miss. Marguerite Rut | \n",
+ " 4.0 | \n",
+ " -1.770360 | \n",
+ " -1.790277 | \n",
+ " -2.097751 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Bonnell, Miss. Elizabeth | \n",
+ " 58.0 | \n",
+ " 1.949591 | \n",
+ " 1.982317 | \n",
+ " 2.044329 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " Saundercock, Mr. William Henry | \n",
+ " 20.0 | \n",
+ " -0.668153 | \n",
+ " -0.672471 | \n",
+ " -0.772286 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " Andersson, Mr. Anders Johan | \n",
+ " 39.0 | \n",
+ " 0.640719 | \n",
+ " 0.654923 | \n",
+ " 0.801705 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " Vestrom, Miss. Hulda Amanda Adolfina | \n",
+ " 14.0 | \n",
+ " -1.081480 | \n",
+ " -1.091648 | \n",
+ " -1.269335 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " Hewlett, Mrs. (Mary D Kingcome) | \n",
+ " 55.0 | \n",
+ " 1.742927 | \n",
+ " 1.772729 | \n",
+ " 2.044329 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " Rice, Master. Eugene | \n",
+ " 2.0 | \n",
+ " -1.908136 | \n",
+ " -1.930003 | \n",
+ " -2.263435 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " Williams, Mr. Charles Eugene | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.031205 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Vander Planke, Mrs. Julius (Emelia Maria Vande... | \n",
+ " 31.0 | \n",
+ " 0.089615 | \n",
+ " 0.096020 | \n",
+ " 0.138972 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " Masselmani, Mrs. Fatima | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.031205 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Name Age AgeStand \\\n",
+ "1 Braund, Mr. Owen Harris 22.0 -0.530377 \n",
+ "2 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 0.571831 \n",
+ "3 Heikkinen, Miss. Laina 26.0 -0.254825 \n",
+ "4 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 0.365167 \n",
+ "5 Allen, Mr. William Henry 35.0 0.365167 \n",
+ "6 Moran, Mr. James NaN NaN \n",
+ "7 McCarthy, Mr. Timothy J 54.0 1.674039 \n",
+ "8 Palsson, Master. Gosta Leonard 2.0 -1.908136 \n",
+ "9 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 27.0 -0.185937 \n",
+ "10 Nasser, Mrs. Nicholas (Adele Achem) 14.0 -1.081480 \n",
+ "11 Sandstrom, Miss. Marguerite Rut 4.0 -1.770360 \n",
+ "12 Bonnell, Miss. Elizabeth 58.0 1.949591 \n",
+ "13 Saundercock, Mr. William Henry 20.0 -0.668153 \n",
+ "14 Andersson, Mr. Anders Johan 39.0 0.640719 \n",
+ "15 Vestrom, Miss. Hulda Amanda Adolfina 14.0 -1.081480 \n",
+ "16 Hewlett, Mrs. (Mary D Kingcome) 55.0 1.742927 \n",
+ "17 Rice, Master. Eugene 2.0 -1.908136 \n",
+ "18 Williams, Mr. Charles Eugene NaN NaN \n",
+ "19 Vander Planke, Mrs. Julius (Emelia Maria Vande... 31.0 0.089615 \n",
+ "20 Masselmani, Mrs. Fatima NaN NaN \n",
+ "\n",
+ " AgeClipStand AgeWinsorizeStand \n",
+ "1 -0.532745 -0.606602 \n",
+ "2 0.585060 0.718863 \n",
+ "3 -0.253294 -0.275236 \n",
+ "4 0.375472 0.470339 \n",
+ "5 0.375472 0.470339 \n",
+ "6 NaN 0.031205 \n",
+ "7 1.702866 2.044329 \n",
+ "8 -1.930003 -2.263435 \n",
+ "9 -0.183431 -0.192394 \n",
+ "10 -1.091648 -1.269335 \n",
+ "11 -1.790277 -2.097751 \n",
+ "12 1.982317 2.044329 \n",
+ "13 -0.672471 -0.772286 \n",
+ "14 0.654923 0.801705 \n",
+ "15 -1.091648 -1.269335 \n",
+ "16 1.772729 2.044329 \n",
+ "17 -1.930003 -2.263435 \n",
+ "18 NaN 0.031205 \n",
+ "19 0.096020 0.138972 \n",
+ "20 NaN 0.031205 "
+ ]
+ },
+ "execution_count": 152,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn import preprocessing\n",
+ "\n",
+ "# fit_transform refits the scaler on each column, so one scaler object serves all three\n",
+ "standard_scaler = preprocessing.StandardScaler()\n",
+ "titanic_norm[\"AgeStand\"] = standard_scaler.fit_transform(\n",
+ "    titanic_norm[[\"Age\"]].to_numpy()\n",
+ ").ravel()\n",
+ "\n",
+ "titanic_norm[\"AgeClipStand\"] = standard_scaler.fit_transform(\n",
+ "    titanic_norm[[\"AgeClip\"]].to_numpy()\n",
+ ").ravel()\n",
+ "\n",
+ "titanic_norm[\"AgeWinsorizeStand\"] = standard_scaler.fit_transform(\n",
+ "    titanic_norm[[\"AgeWinsorize\"]].to_numpy()\n",
+ ").ravel()\n",
+ "\n",
+ "titanic_norm[[\"Name\", \"Age\", \"AgeStand\", \"AgeClipStand\", \"AgeWinsorizeStand\"]].head(20)"
+ ]
}
],
"metadata": {