коммит2

2024-11-15 22:37:33 +04:00 · 2024-11-15 22:37:33 +04:00 · 8fc280c7e7
commit 8fc280c7e7
parent 41c4ab91ed
1 changed files with 761 additions and 123 deletions
--- a/lab_4/lab4.ipynb
+++ b/lab_4/lab4.ipynb
@ -33,7 +33,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "# Подготовим данные: удалим колонки rank и name(в них уникальные значения, которые не участвуют в предсказаниях). А также преобразуем номинальные колонки в числовые(country, source, industry) и категоризируем колонку age"
+    "# Подготовим данные: проверим наличие пропущенных значений и категоризируем колонку Age"
   ]
  },
  {
@ -45,167 +45,676 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "   Networth  Age        Country              Source                Industry\n",
-      "0     219.0   50  United States       Tesla, SpaceX             Automotive \n",
-      "1     171.0   58  United States              Amazon             Technology \n",
-      "2     158.0   73         France                LVMH       Fashion & Retail \n",
-      "3     129.0   66  United States           Microsoft             Technology \n",
-      "4     118.0   91  United States  Berkshire Hathaway  Finance & Investments \n"
+      "Rank        0\n",
+      "Name        0\n",
+      "Networth    0\n",
+      "Age         0\n",
+      "Country     0\n",
+      "Source      0\n",
+      "Industry    0\n",
+      "dtype: int64\n",
+      "\n",
+      "Rank        False\n",
+      "Name        False\n",
+      "Networth    False\n",
+      "Age         False\n",
+      "Country     False\n",
+      "Source      False\n",
+      "Industry    False\n",
+      "dtype: bool\n",
+      "\n"
     ]
    }
   ],
   "source": [
-    "# Удаление колонок 'rank' и 'name'\n",
-    "df.drop(columns=['Rank ', 'Name'], inplace=True)\n",
+    "print(df.isnull().sum())\n",
    "\n",
-    "# Проверка, что колонки были удалены\n",
-    "print(df.head())"
+    "print()\n",
+    "\n",
+    "# Есть ли пустые значения признаков\n",
+    "print(df.isnull().any())\n",
+    "\n",
+    "print()\n",
+    "\n",
+    "# Процент пустых значений признаков\n",
+    "for i in df.columns:\n",
+    "    null_rate = df[i].isnull().sum() / len(df) * 100\n",
+    "    if null_rate > 0:\n",
+    "        print(f\"{i} процент пустых значений: {null_rate:.2f}%\")"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "   Networth  Country_Argentina  Country_Australia  Country_Austria  \\\n",
-      "0     219.0              False              False            False   \n",
-      "1     171.0              False              False            False   \n",
-      "2     158.0              False              False            False   \n",
-      "3     129.0              False              False            False   \n",
-      "4     118.0              False              False            False   \n",
+      "   Rank                        Name  Networth        Country  \\\n",
+      "0      1                 Elon Musk      219.0  United States   \n",
+      "1      2                Jeff Bezos      171.0  United States   \n",
+      "2      3  Bernard Arnault & family      158.0         France   \n",
+      "3      4                Bill Gates      129.0  United States   \n",
+      "4      5            Warren Buffett      118.0  United States   \n",
      "\n",
-      "   Country_Barbados  Country_Belgium  Country_Belize  Country_Brazil  \\\n",
-      "0             False            False           False           False   \n",
-      "1             False            False           False           False   \n",
-      "2             False            False           False           False   \n",
-      "3             False            False           False           False   \n",
-      "4             False            False           False           False   \n",
-      "\n",
-      "   Country_Bulgaria  Country_Canada  ...  wind  wine  winter  wire  wireless  \\\n",
-      "0             False           False  ...   0.0   0.0     0.0   0.0       0.0   \n",
-      "1             False           False  ...   0.0   0.0     0.0   0.0       0.0   \n",
-      "2             False           False  ...   0.0   0.0     0.0   0.0       0.0   \n",
-      "3             False           False  ...   0.0   0.0     0.0   0.0       0.0   \n",
-      "4             False           False  ...   0.0   0.0     0.0   0.0       0.0   \n",
-      "\n",
-      "   yahoo  yogurt  zara  zoom    Age  \n",
-      "0    0.0     0.0   0.0   0.0  50-60  \n",
-      "1    0.0     0.0   0.0   0.0  50-60  \n",
-      "2    0.0     0.0   0.0   0.0  70-80  \n",
-      "3    0.0     0.0   0.0   0.0  60-70  \n",
-      "4    0.0     0.0   0.0   0.0    80+  \n",
-      "\n",
-      "[5 rows x 828 columns]\n"
+      "               Source                Industry Age_category  \n",
+      "0       Tesla, SpaceX             Automotive         50-60  \n",
+      "1              Amazon             Technology         50-60  \n",
+      "2                LVMH       Fashion & Retail         70-80  \n",
+      "3           Microsoft             Technology         60-70  \n",
+      "4  Berkshire Hathaway  Finance & Investments           80+  \n"
     ]
    }
   ],
   "source": [
-    "from sklearn.preprocessing import OneHotEncoder\n",
-    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
-    "# Преобразуем 'country' и 'industry' в бинарные матрицы с помощью One-Hot Encoding\n",
-    "df_country = pd.get_dummies(df[['Country']], drop_first=True)\n",
-    "df_industry = pd.get_dummies(df[['Industry']], drop_first=True)\n",
    "\n",
-    "# Преобразуем колонку 'source' с помощью TF-IDF\n",
-    "tfidf_vectorizer = TfidfVectorizer(max_features=1000)  \n",
-    "X_tfidf = tfidf_vectorizer.fit_transform(df['Source']).toarray()\n",
-    "\n",
-    "# Создаем DataFrame с результатами TF-IDF\n",
-    "df_source_tfidf = pd.DataFrame(X_tfidf, columns=tfidf_vectorizer.get_feature_names_out())\n",
-    "\n",
-    "bins = [0, 30, 40, 50, 60, 70, 80, 100]  # границы для возрастных категорий\n",
+    "bins = [0, 30, 40, 50, 60, 70, 80, 101]  # границы для возрастных категорий\n",
    "labels = ['Under 30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+']  # метки для категорий\n",
    "\n",
-    "# Создаем новую колонку 'age_group', где будет храниться категория\n",
-    "df_age_group = pd.cut(df['Age'], bins=bins, labels=labels, right=False)\n",
-    "\n",
-    "\n",
+    "df[\"Age_category\"] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)\n",
    "# Удаляем исходную колонку 'Age': её заменяет категориальная колонка 'Age_category'\n",
-    "df.drop(columns=['Country', 'Industry', 'Source', 'Age'], inplace=True)\n",
-    "\n",
-    "# Объединяем все преобразованные данные в один DataFrame\n",
-    "df_transformed = pd.concat([df, df_country, df_industry, df_source_tfidf, df_age_group], axis=1)\n",
+    "df.drop(columns=['Age'], inplace=True)\n",
    "\n",
    "# Просмотр результата\n",
-    "print(df_transformed.head())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Разобьём набор данных на обучающую и тестовые выборки (80/20) для задачи классификации. Целевой признак- Age"
+    "print(df.head())"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
-     "ename": "NameError",
-     "evalue": "name 'df_transformed' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
-      "Cell \u001b[1;32mIn[18], line 46\u001b[0m\n\u001b[0;32m     42\u001b[0m     \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(df_input) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_train) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_val) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_test)\n\u001b[0;32m     43\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m df_train, df_val, df_test, y_train, y_val, y_test\n\u001b[0;32m     45\u001b[0m X_train, X_val, X_test, y_train, y_val, y_test \u001b[38;5;241m=\u001b[39m split_stratified_into_train_val_test(\n\u001b[1;32m---> 46\u001b[0m     \u001b[43mdf_transformed\u001b[49m, stratify_colname\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAge\u001b[39m\u001b[38;5;124m\"\u001b[39m, frac_train\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.80\u001b[39m, frac_val\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m, frac_test\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.20\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m     47\u001b[0m )\n\u001b[0;32m     49\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mX_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, X_train)\n\u001b[0;32m     50\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124my_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, y_train)\n",
-      "\u001b[1;31mNameError\u001b[0m: name 'df_transformed' is not defined"
+     "data": {
+      "text/plain": [
+       "'X_train'"
      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Rank</th>\n",
+       "      <th>Name</th>\n",
+       "      <th>Networth</th>\n",
+       "      <th>Country</th>\n",
+       "      <th>Source</th>\n",
+       "      <th>Industry</th>\n",
+       "      <th>Age_category</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1909</th>\n",
+       "      <td>1818</td>\n",
+       "      <td>Tran Ba Duong &amp; family</td>\n",
+       "      <td>1.6</td>\n",
+       "      <td>Vietnam</td>\n",
+       "      <td>automotive</td>\n",
+       "      <td>Automotive</td>\n",
+       "      <td>60-70</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2099</th>\n",
+       "      <td>2076</td>\n",
+       "      <td>Mark Dixon</td>\n",
+       "      <td>1.4</td>\n",
+       "      <td>United Kingdom</td>\n",
+       "      <td>office real estate</td>\n",
+       "      <td>Real Estate</td>\n",
+       "      <td>60-70</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1392</th>\n",
+       "      <td>1341</td>\n",
+       "      <td>Yingzhuo Xu</td>\n",
+       "      <td>2.3</td>\n",
+       "      <td>China</td>\n",
+       "      <td>agribusiness</td>\n",
+       "      <td>Food &amp; Beverage</td>\n",
+       "      <td>50-60</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>627</th>\n",
+       "      <td>622</td>\n",
+       "      <td>Bruce Flatt</td>\n",
+       "      <td>4.6</td>\n",
+       "      <td>Canada</td>\n",
+       "      <td>money management</td>\n",
+       "      <td>Finance &amp; Investments</td>\n",
+       "      <td>50-60</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>527</th>\n",
+       "      <td>523</td>\n",
+       "      <td>Li Liangbin</td>\n",
+       "      <td>5.2</td>\n",
+       "      <td>China</td>\n",
+       "      <td>lithium</td>\n",
+       "      <td>Manufacturing</td>\n",
+       "      <td>50-60</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>84</th>\n",
+       "      <td>85</td>\n",
+       "      <td>Theo Albrecht, Jr. &amp; family</td>\n",
+       "      <td>18.7</td>\n",
+       "      <td>Germany</td>\n",
+       "      <td>Aldi, Trader Joe's</td>\n",
+       "      <td>Fashion &amp; Retail</td>\n",
+       "      <td>70-80</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>633</th>\n",
+       "      <td>622</td>\n",
+       "      <td>Tony Tamer</td>\n",
+       "      <td>4.6</td>\n",
+       "      <td>United States</td>\n",
+       "      <td>private equity</td>\n",
+       "      <td>Finance &amp; Investments</td>\n",
+       "      <td>60-70</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>922</th>\n",
+       "      <td>913</td>\n",
+       "      <td>Bob Gaglardi</td>\n",
+       "      <td>3.3</td>\n",
+       "      <td>Canada</td>\n",
+       "      <td>hotels</td>\n",
+       "      <td>Real Estate</td>\n",
+       "      <td>80+</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2178</th>\n",
+       "      <td>2076</td>\n",
+       "      <td>Eugene Wu</td>\n",
+       "      <td>1.4</td>\n",
+       "      <td>Taiwan</td>\n",
+       "      <td>finance</td>\n",
+       "      <td>Finance &amp; Investments</td>\n",
+       "      <td>70-80</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>415</th>\n",
+       "      <td>411</td>\n",
+       "      <td>Leonard Stern</td>\n",
+       "      <td>6.2</td>\n",
+       "      <td>United States</td>\n",
+       "      <td>real estate</td>\n",
+       "      <td>Real Estate</td>\n",
+       "      <td>80+</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>2080 rows × 7 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      Rank                           Name  Networth         Country  \\\n",
+       "1909   1818       Tran Ba Duong & family        1.6         Vietnam   \n",
+       "2099   2076                   Mark Dixon        1.4  United Kingdom   \n",
+       "1392   1341                  Yingzhuo Xu        2.3           China   \n",
+       "627     622                  Bruce Flatt        4.6          Canada   \n",
+       "527     523                  Li Liangbin        5.2           China   \n",
+       "...     ...                           ...       ...             ...   \n",
+       "84       85  Theo Albrecht, Jr. & family       18.7         Germany   \n",
+       "633     622                   Tony Tamer        4.6   United States   \n",
+       "922     913                 Bob Gaglardi        3.3          Canada   \n",
+       "2178   2076                    Eugene Wu        1.4          Taiwan   \n",
+       "415     411                Leonard Stern        6.2   United States   \n",
+       "\n",
+       "                  Source                Industry Age_category  \n",
+       "1909          automotive             Automotive         60-70  \n",
+       "2099  office real estate            Real Estate         60-70  \n",
+       "1392        agribusiness        Food & Beverage         50-60  \n",
+       "627     money management  Finance & Investments         50-60  \n",
+       "527              lithium          Manufacturing         50-60  \n",
+       "...                  ...                     ...          ...  \n",
+       "84    Aldi, Trader Joe's       Fashion & Retail         70-80  \n",
+       "633       private equity  Finance & Investments         60-70  \n",
+       "922               hotels            Real Estate           80+  \n",
+       "2178             finance  Finance & Investments         70-80  \n",
+       "415          real estate            Real Estate           80+  \n",
+       "\n",
+       "[2080 rows x 7 columns]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'y_train'"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Age_category</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1909</th>\n",
+       "      <td>60-70</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2099</th>\n",
+       "      <td>60-70</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1392</th>\n",
+       "      <td>50-60</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>627</th>\n",
+       "      <td>50-60</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>527</th>\n",
+       "      <td>50-60</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>84</th>\n",
+       "      <td>70-80</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>633</th>\n",
+       "      <td>60-70</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>922</th>\n",
+       "      <td>80+</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2178</th>\n",
+       "      <td>70-80</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>415</th>\n",
+       "      <td>80+</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>2080 rows × 1 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     Age_category\n",
+       "1909        60-70\n",
+       "2099        60-70\n",
+       "1392        50-60\n",
+       "627         50-60\n",
+       "527         50-60\n",
+       "...           ...\n",
+       "84          70-80\n",
+       "633         60-70\n",
+       "922           80+\n",
+       "2178        70-80\n",
+       "415           80+\n",
+       "\n",
+       "[2080 rows x 1 columns]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'X_test'"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Rank</th>\n",
+       "      <th>Name</th>\n",
+       "      <th>Networth</th>\n",
+       "      <th>Country</th>\n",
+       "      <th>Source</th>\n",
+       "      <th>Industry</th>\n",
+       "      <th>Age_category</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>2075</th>\n",
+       "      <td>2076</td>\n",
+       "      <td>Radhe Shyam Agarwal</td>\n",
+       "      <td>1.4</td>\n",
+       "      <td>India</td>\n",
+       "      <td>consumer goods</td>\n",
+       "      <td>Fashion &amp; Retail</td>\n",
+       "      <td>70-80</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1529</th>\n",
+       "      <td>1513</td>\n",
+       "      <td>Robert Duggan</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>United States</td>\n",
+       "      <td>pharmaceuticals</td>\n",
+       "      <td>Healthcare</td>\n",
+       "      <td>70-80</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1803</th>\n",
+       "      <td>1729</td>\n",
+       "      <td>Yao Kuizhang</td>\n",
+       "      <td>1.7</td>\n",
+       "      <td>China</td>\n",
+       "      <td>beverages</td>\n",
+       "      <td>Food &amp; Beverage</td>\n",
+       "      <td>50-60</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>425</th>\n",
+       "      <td>424</td>\n",
+       "      <td>Alexei Kuzmichev</td>\n",
+       "      <td>6.0</td>\n",
+       "      <td>Russia</td>\n",
+       "      <td>oil, banking, telecom</td>\n",
+       "      <td>Energy</td>\n",
+       "      <td>50-60</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2597</th>\n",
+       "      <td>2578</td>\n",
+       "      <td>Ramesh Genomal</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>Philippines</td>\n",
+       "      <td>apparel</td>\n",
+       "      <td>Fashion &amp; Retail</td>\n",
+       "      <td>70-80</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>935</th>\n",
+       "      <td>913</td>\n",
+       "      <td>Alfred Oetker</td>\n",
+       "      <td>3.3</td>\n",
+       "      <td>Germany</td>\n",
+       "      <td>consumer goods</td>\n",
+       "      <td>Fashion &amp; Retail</td>\n",
+       "      <td>50-60</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1541</th>\n",
+       "      <td>1513</td>\n",
+       "      <td>Thomas Lee</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>United States</td>\n",
+       "      <td>private equity</td>\n",
+       "      <td>Finance &amp; Investments</td>\n",
+       "      <td>70-80</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1646</th>\n",
+       "      <td>1645</td>\n",
+       "      <td>Roberto Angelini Rossi</td>\n",
+       "      <td>1.8</td>\n",
+       "      <td>Chile</td>\n",
+       "      <td>forestry, mining</td>\n",
+       "      <td>diversified</td>\n",
+       "      <td>70-80</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>376</th>\n",
+       "      <td>375</td>\n",
+       "      <td>Patrick Drahi</td>\n",
+       "      <td>6.6</td>\n",
+       "      <td>France</td>\n",
+       "      <td>telecom</td>\n",
+       "      <td>Telecom</td>\n",
+       "      <td>50-60</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1894</th>\n",
+       "      <td>1818</td>\n",
+       "      <td>Gerald Schwartz</td>\n",
+       "      <td>1.6</td>\n",
+       "      <td>Canada</td>\n",
+       "      <td>finance</td>\n",
+       "      <td>Finance &amp; Investments</td>\n",
+       "      <td>80+</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>520 rows × 7 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      Rank                      Name  Networth        Country  \\\n",
+       "2075   2076     Radhe Shyam Agarwal        1.4          India   \n",
+       "1529   1513           Robert Duggan        2.0  United States   \n",
+       "1803   1729            Yao Kuizhang        1.7          China   \n",
+       "425     424        Alexei Kuzmichev        6.0         Russia   \n",
+       "2597   2578          Ramesh Genomal        1.0    Philippines   \n",
+       "...     ...                      ...       ...            ...   \n",
+       "935     913           Alfred Oetker        3.3        Germany   \n",
+       "1541   1513              Thomas Lee        2.0  United States   \n",
+       "1646   1645  Roberto Angelini Rossi        1.8          Chile   \n",
+       "376     375           Patrick Drahi        6.6         France   \n",
+       "1894   1818         Gerald Schwartz        1.6         Canada   \n",
+       "\n",
+       "                     Source                Industry Age_category  \n",
+       "2075         consumer goods       Fashion & Retail         70-80  \n",
+       "1529        pharmaceuticals             Healthcare         70-80  \n",
+       "1803              beverages        Food & Beverage         50-60  \n",
+       "425   oil, banking, telecom                 Energy         50-60  \n",
+       "2597                apparel       Fashion & Retail         70-80  \n",
+       "...                     ...                     ...          ...  \n",
+       "935          consumer goods       Fashion & Retail         50-60  \n",
+       "1541         private equity  Finance & Investments         70-80  \n",
+       "1646       forestry, mining          diversified           70-80  \n",
+       "376                 telecom                Telecom         50-60  \n",
+       "1894                finance  Finance & Investments           80+  \n",
+       "\n",
+       "[520 rows x 7 columns]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'y_test'"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Age_category</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>2075</th>\n",
+       "      <td>70-80</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1529</th>\n",
+       "      <td>70-80</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1803</th>\n",
+       "      <td>50-60</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>425</th>\n",
+       "      <td>50-60</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2597</th>\n",
+       "      <td>70-80</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>935</th>\n",
+       "      <td>50-60</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1541</th>\n",
+       "      <td>70-80</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1646</th>\n",
+       "      <td>70-80</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>376</th>\n",
+       "      <td>50-60</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1894</th>\n",
+       "      <td>80+</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>520 rows × 1 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     Age_category\n",
+       "2075        70-80\n",
+       "1529        70-80\n",
+       "1803        50-60\n",
+       "425         50-60\n",
+       "2597        70-80\n",
+       "...           ...\n",
+       "935         50-60\n",
+       "1541        70-80\n",
+       "1646        70-80\n",
+       "376         50-60\n",
+       "1894          80+\n",
+       "\n",
+       "[520 rows x 1 columns]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
    }
   ],
   "source": [
-    "from typing import Tuple\n",
-    "import pandas as pd\n",
-    "from pandas import DataFrame\n",
-    "from sklearn.model_selection import train_test_split\n",
-    "\n",
-    "def split_stratified_into_train_val_test(\n",
-    "    df_input,\n",
-    "    stratify_colname=\"y\",\n",
-    "    frac_train=0.6,\n",
-    "    frac_val=0.15,\n",
-    "    frac_test=0.25,\n",
-    "    random_state=None,\n",
-    ") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
-    "   \n",
-    "    if frac_train + frac_val + frac_test != 1.0:\n",
-    "        raise ValueError(\n",
-    "            \"fractions %f, %f, %f do not add up to 1.0\"\n",
-    "            % (frac_train, frac_val, frac_test)\n",
-    "        )\n",
-    "    if stratify_colname not in df_input.columns:\n",
-    "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
-    "    X = df_input  # Contains all columns.\n",
-    "    y = df_input[\n",
-    "        [stratify_colname]\n",
-    "    ]  # Dataframe of just the column on which to stratify.\n",
-    "    # Split original dataframe into train and temp dataframes.\n",
-    "    df_train, df_temp, y_train, y_temp = train_test_split(\n",
-    "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
-    "    )\n",
-    "    if frac_val <= 0:\n",
-    "        assert len(df_input) == len(df_train) + len(df_temp)\n",
-    "        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
-    "    # Split the temp dataframe into val and test dataframes.\n",
-    "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
-    "    df_val, df_test, y_val, y_test = train_test_split(\n",
-    "        df_temp,\n",
-    "        y_temp,\n",
-    "        stratify=y_temp,\n",
-    "        test_size=relative_frac_test,\n",
-    "        random_state=random_state,\n",
-    "    )\n",
-    "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
-    "    return df_train, df_val, df_test, y_train, y_val, y_test\n",
+    "from utils import split_stratified_into_train_val_test\n",
    "\n",
    "X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
-    "    df, stratify_colname=\"Age\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=None\n",
+    "    df, stratify_colname=\"Age_category\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=9\n",
    ")\n",
    "\n",
    "display(\"X_train\", X_train)\n",
@ -214,6 +723,135 @@
    "display(\"X_test\", X_test)\n",
    "display(\"y_test\", y_test)"
   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Формирование конвейера для классификации данных\n",
+    "\n",
+    "- `preprocessing_num` — конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n",
+    "- `preprocessing_cat` — конвейер для обработки категориальных данных: заполнение пропущенных значений и унитарное кодирование\n",
+    "- `features_preprocessing` — трансформер для предобработки признаков\n",
+    "- `features_engineering` — трансформер для конструирования признаков\n",
+    "- `drop_columns` — трансформер для удаления колонок\n",
+    "- `pipeline_end` — основной конвейер предобработки данных и конструирования признаков"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.compose import ColumnTransformer\n",
+    "from sklearn.impute import SimpleImputer\n",
+    "from sklearn.pipeline import Pipeline\n",
+    "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
+    "\n",
+    "# Columns removed from the feature matrix by the drop_columns step.\n",
+    "# The trailing space in \"Rank \" is intentional: the column is referenced with this exact name.\n",
+    "columns_to_drop = [\"Rank \", \"Name\"]\n",
+    "\n",
+    "# Split the remaining columns by dtype. is_numeric_dtype is used rather than a\n",
+    "# comparison with \"object\" so that other non-numeric dtypes (e.g. \"category\")\n",
+    "# are routed to the categorical branch instead of the median imputer / scaler.\n",
+    "feature_columns = [column for column in df.columns if column not in columns_to_drop]\n",
+    "num_columns = [\n",
+    "    column for column in feature_columns if pd.api.types.is_numeric_dtype(df[column])\n",
+    "]\n",
+    "cat_columns = [column for column in feature_columns if column not in num_columns]\n",
+    "\n",
+    "# Numeric branch: fill missing values with the median, then standardize.\n",
+    "num_imputer = SimpleImputer(strategy=\"median\")\n",
+    "num_scaler = StandardScaler()\n",
+    "preprocessing_num = Pipeline(\n",
+    "    [\n",
+    "        (\"imputer\", num_imputer),\n",
+    "        (\"scaler\", num_scaler),\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "# Categorical branch: fill missing values with \"unknown\", then one-hot encode.\n",
+    "cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n",
+    "cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n",
+    "preprocessing_cat = Pipeline(\n",
+    "    [\n",
+    "        (\"imputer\", cat_imputer),\n",
+    "        (\"encoder\", cat_encoder),\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "# Every entry of `transformers` must be a (name, transformer, columns) triple;\n",
+    "# a two-element entry makes fit_transform fail with\n",
+    "# \"not enough values to unpack (expected 3, got 2)\".\n",
+    "# Columns not listed here (those in columns_to_drop) pass through unchanged.\n",
+    "features_preprocessing = ColumnTransformer(\n",
+    "    verbose_feature_names_out=False,\n",
+    "    transformers=[\n",
+    "        (\"prepocessing_num\", preprocessing_num, num_columns),\n",
+    "        (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n",
+    "    ],\n",
+    "    remainder=\"passthrough\",\n",
+    ")\n",
+    "\n",
+    "# Removes the passed-through identifier columns; everything else is kept.\n",
+    "drop_columns = ColumnTransformer(\n",
+    "    verbose_feature_names_out=False,\n",
+    "    transformers=[\n",
+    "        (\"drop_columns\", \"drop\", columns_to_drop),\n",
+    "    ],\n",
+    "    remainder=\"passthrough\",\n",
+    ")\n",
+    "\n",
+    "# drop_columns selects columns by name, so each step has to emit a DataFrame\n",
+    "# rather than a NumPy array: request pandas output for the whole pipeline.\n",
+    "pipeline_end = Pipeline(\n",
+    "    [\n",
+    "        (\"features_preprocessing\", features_preprocessing),\n",
+    "        (\"drop_columns\", drop_columns),\n",
+    "    ]\n",
+    ").set_output(transform=\"pandas\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ValueError",
+     "evalue": "not enough values to unpack (expected 3, got 2)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[31], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m preprocessing_result \u001b[38;5;241m=\u001b[39m \u001b[43mpipeline_end\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m      2\u001b[0m preprocessed_df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(\n\u001b[0;32m      3\u001b[0m     preprocessing_result,\n\u001b[0;32m      4\u001b[0m     columns\u001b[38;5;241m=\u001b[39mpipeline_end\u001b[38;5;241m.\u001b[39mget_feature_names_out(),\n\u001b[0;32m      5\u001b[0m )\n\u001b[0;32m      7\u001b[0m preprocessed_df\n",
+      "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m   1466\u001b[0m     estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m   1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m   1469\u001b[0m     skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m   1470\u001b[0m         prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m   1471\u001b[0m     )\n\u001b[0;32m   1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py:533\u001b[0m, in \u001b[0;36mPipeline.fit_transform\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m    490\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Fit the model and transform with the final estimator.\u001b[39;00m\n\u001b[0;32m    491\u001b[0m \n\u001b[0;32m    492\u001b[0m \u001b[38;5;124;03mFit all the transformers one after the other and sequentially transform\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    530\u001b[0m \u001b[38;5;124;03m    Transformed samples.\u001b[39;00m\n\u001b[0;32m    531\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m    532\u001b[0m routed_params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_method_params(method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m, props\u001b[38;5;241m=\u001b[39mparams)\n\u001b[1;32m--> 533\u001b[0m Xt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrouted_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    535\u001b[0m last_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_final_estimator\n\u001b[0;32m    536\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_log_message(\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m)):\n",
+      "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py:406\u001b[0m, in \u001b[0;36mPipeline._fit\u001b[1;34m(self, X, y, routed_params)\u001b[0m\n\u001b[0;32m    404\u001b[0m     cloned_transformer \u001b[38;5;241m=\u001b[39m clone(transformer)\n\u001b[0;32m    405\u001b[0m \u001b[38;5;66;03m# Fit or load from cache the current transformer\u001b[39;00m\n\u001b[1;32m--> 406\u001b[0m X, fitted_transformer \u001b[38;5;241m=\u001b[39m \u001b[43mfit_transform_one_cached\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    407\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcloned_transformer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    408\u001b[0m \u001b[43m    \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    409\u001b[0m \u001b[43m    \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    410\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m    411\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmessage_clsname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPipeline\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m    412\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmessage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_message\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstep_idx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    413\u001b[0m \u001b[43m    \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrouted_params\u001b[49m\u001b[43m[\u001b[49m\u001b[43mname\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    414\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    415\u001b[0m \u001b[38;5;66;03m# Replace the transformer of the step with the fitted\u001b[39;00m\n\u001b[0;32m    416\u001b[0m \u001b[38;5;66;03m# 
transformer. This is necessary when loading the transformer\u001b[39;00m\n\u001b[0;32m    417\u001b[0m \u001b[38;5;66;03m# from the cache.\u001b[39;00m\n\u001b[0;32m    418\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps[step_idx] \u001b[38;5;241m=\u001b[39m (name, fitted_transformer)\n",
+      "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\joblib\\memory.py:312\u001b[0m, in \u001b[0;36mNotMemorizedFunc.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m    311\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 312\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py:1310\u001b[0m, in \u001b[0;36m_fit_transform_one\u001b[1;34m(transformer, X, y, weight, message_clsname, message, params)\u001b[0m\n\u001b[0;32m   1308\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(message_clsname, message):\n\u001b[0;32m   1309\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(transformer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m-> 1310\u001b[0m         res \u001b[38;5;241m=\u001b[39m \u001b[43mtransformer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfit_transform\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   1311\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m   1312\u001b[0m         res \u001b[38;5;241m=\u001b[39m transformer\u001b[38;5;241m.\u001b[39mfit(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit\u001b[39m\u001b[38;5;124m\"\u001b[39m, {}))\u001b[38;5;241m.\u001b[39mtransform(\n\u001b[0;32m   1313\u001b[0m             X, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtransform\u001b[39m\u001b[38;5;124m\"\u001b[39m, {})\n\u001b[0;32m   1314\u001b[0m         )\n",
+      "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\utils\\_set_output.py:316\u001b[0m, in \u001b[0;36m_wrap_method_output.<locals>.wrapped\u001b[1;34m(self, X, *args, **kwargs)\u001b[0m\n\u001b[0;32m    314\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(f)\n\u001b[0;32m    315\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapped\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 316\u001b[0m     data_to_wrap \u001b[38;5;241m=\u001b[39m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    317\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_to_wrap, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[0;32m    318\u001b[0m         \u001b[38;5;66;03m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[0;32m    319\u001b[0m         return_tuple \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m    320\u001b[0m             _wrap_data_with_container(method, data_to_wrap[\u001b[38;5;241m0\u001b[39m], X, \u001b[38;5;28mself\u001b[39m),\n\u001b[0;32m    321\u001b[0m             \u001b[38;5;241m*\u001b[39mdata_to_wrap[\u001b[38;5;241m1\u001b[39m:],\n\u001b[0;32m    322\u001b[0m         )\n",
+      "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m   1466\u001b[0m     estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m   1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m   1469\u001b[0m     skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m   1470\u001b[0m         prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m   1471\u001b[0m     )\n\u001b[0;32m   1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:965\u001b[0m, in \u001b[0;36mColumnTransformer.fit_transform\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m    963\u001b[0m \u001b[38;5;66;03m# set n_features_in_ attribute\u001b[39;00m\n\u001b[0;32m    964\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_n_features(X, reset\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m--> 965\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_transformers\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    966\u001b[0m n_samples \u001b[38;5;241m=\u001b[39m _num_samples(X)\n\u001b[0;32m    968\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_column_callables(X)\n",
+      "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:501\u001b[0m, in \u001b[0;36mColumnTransformer._validate_transformers\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransformers:\n\u001b[0;32m    499\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m--> 501\u001b[0m names, transformers, _ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mzip\u001b[39m(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransformers)\n\u001b[0;32m    503\u001b[0m \u001b[38;5;66;03m# validate names\u001b[39;00m\n\u001b[0;32m    504\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_names(names)\n",
+      "\u001b[1;31mValueError\u001b[0m: not enough values to unpack (expected 3, got 2)"
+     ]
+    }
+   ],
+   "source": [
+    "# Fit the preprocessing pipeline on the training split and transform it.\n",
+    "preprocessing_result = pipeline_end.fit_transform(X_train)\n",
+    "\n",
+    "# Wrap the result in a DataFrame labelled with the generated feature names.\n",
+    "# The index of X_train is reused so the rows stay aligned with y_train.\n",
+    "preprocessed_df = pd.DataFrame(\n",
+    "    preprocessing_result,\n",
+    "    columns=pipeline_end.get_feature_names_out(),\n",
+    "    index=X_train.index,\n",
+    ")\n",
+    "\n",
+    "preprocessed_df"
+   ]
  }
 ],
 "metadata": {