commit2

This commit is contained in:
annalyovushkina@yandex.ru 2024-11-15 22:37:33 +04:00
parent 41c4ab91ed
commit 8fc280c7e7


@@ -33,7 +33,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
-"# Prepare the data: drop the rank and name columns (they hold unique values that play no part in prediction), convert the nominal columns (country, source, industry) to numeric, and categorize the age column"
"# Prepare the data: categorize the age column"
]
},
{
@@ -45,167 +45,676 @@
"name": "stdout",
"output_type": "stream",
"text": [
-" Networth Age Country Source Industry\n",
-"0 219.0 50 United States Tesla, SpaceX Automotive \n",
-"1 171.0 58 United States Amazon Technology \n",
-"2 158.0 73 France LVMH Fashion & Retail \n",
-"3 129.0 66 United States Microsoft Technology \n",
-"4 118.0 91 United States Berkshire Hathaway Finance & Investments \n"
"Rank 0\n",
"Name 0\n",
"Networth 0\n",
"Age 0\n",
"Country 0\n",
"Source 0\n",
"Industry 0\n",
"dtype: int64\n",
"\n",
"Rank False\n",
"Name False\n",
"Networth False\n",
"Age False\n",
"Country False\n",
"Source False\n",
"Industry False\n",
"dtype: bool\n",
"\n"
]
}
],
"source": [
-"# Drop the 'rank' and 'name' columns\n",
-"df.drop(columns=['Rank ', 'Name'], inplace=True)\n",
-"\n",
-"# Check that the columns were dropped\n",
-"print(df.head())"
"print(df.isnull().sum())\n",
"\n",
"print()\n",
"\n",
"# Are there any missing feature values?\n",
"print(df.isnull().any())\n",
"\n",
"print()\n",
"\n",
"# Percentage of missing values per feature\n",
"for i in df.columns:\n",
"    null_rate = df[i].isnull().sum() / len(df) * 100\n",
"    if null_rate > 0:\n",
"        print(f\"{i} percent of missing values: %{null_rate:.2f}\")"
]
},
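
The new cell replaces the column-dropping step with a missing-value audit: absolute counts, a per-column boolean flag, and a percentage loop. A minimal standalone sketch of the same audit, using a toy frame in place of the notebook's billionaires dataset:

    import pandas as pd

    # Toy stand-in for the dataset; None marks a missing value
    df = pd.DataFrame({"Networth": [219.0, None, 158.0], "Age": [50, 58, None]})

    print(df.isnull().sum())                     # absolute count of missing values per column
    print(df.isnull().any())                     # True where a column has any missing value
    print(df.isnull().mean().mul(100).round(2))  # percentage: one-line version of the loop above
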
{
"cell_type": "code",
-"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
-" Networth Country_Argentina Country_Australia Country_Austria \\\n",
-"0 219.0 False False False \n",
-"1 171.0 False False False \n",
-"2 158.0 False False False \n",
-"3 129.0 False False False \n",
-"4 118.0 False False False \n",
-"\n",
-" Country_Barbados Country_Belgium Country_Belize Country_Brazil \\\n",
-"0 False False False False \n",
-"1 False False False False \n",
-"2 False False False False \n",
-"3 False False False False \n",
-"4 False False False False \n",
-"\n",
-" Country_Bulgaria Country_Canada ... wind wine winter wire wireless \\\n",
-"0 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
-"1 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
-"2 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
-"3 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
-"4 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
-"\n",
-" yahoo yogurt zara zoom Age \n",
-"0 0.0 0.0 0.0 0.0 50-60 \n",
-"1 0.0 0.0 0.0 0.0 50-60 \n",
-"2 0.0 0.0 0.0 0.0 70-80 \n",
-"3 0.0 0.0 0.0 0.0 60-70 \n",
-"4 0.0 0.0 0.0 0.0 80+ \n",
-"\n",
-"[5 rows x 828 columns]\n"
" Rank Name Networth Country \\\n",
"0 1 Elon Musk 219.0 United States \n",
"1 2 Jeff Bezos 171.0 United States \n",
"2 3 Bernard Arnault & family 158.0 France \n",
"3 4 Bill Gates 129.0 United States \n",
"4 5 Warren Buffett 118.0 United States \n",
"\n",
" Source Industry Age_category \n",
"0 Tesla, SpaceX Automotive 50-60 \n",
"1 Amazon Technology 50-60 \n",
"2 LVMH Fashion & Retail 70-80 \n",
"3 Microsoft Technology 60-70 \n",
"4 Berkshire Hathaway Finance & Investments 80+ \n"
]
}
],
"source": [
-"from sklearn.preprocessing import OneHotEncoder\n",
-"from sklearn.feature_extraction.text import TfidfVectorizer\n",
-"\n",
-"# One-hot encode 'country' and 'industry' into binary matrices\n",
-"df_country = pd.get_dummies(df[['Country']], drop_first=True)\n",
-"df_industry = pd.get_dummies(df[['Industry']], drop_first=True)\n",
-"\n",
-"# Encode the 'source' column with TF-IDF\n",
-"tfidf_vectorizer = TfidfVectorizer(max_features=1000)\n",
-"X_tfidf = tfidf_vectorizer.fit_transform(df['Source']).toarray()\n",
-"\n",
-"# Build a DataFrame from the TF-IDF results\n",
-"df_source_tfidf = pd.DataFrame(X_tfidf, columns=tfidf_vectorizer.get_feature_names_out())\n",
-"\n",
-"bins = [0, 30, 40, 50, 60, 70, 80, 100] # age-category bin edges\n",
-"labels = ['Under 30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'] # category labels\n",
-"\n",
-"# Create a new 'age_group' column to hold the category\n",
-"df_age_group = pd.cut(df['Age'], bins=bins, labels=labels, right=False)\n",
-"\n",
-"\n",
-"# Drop the original 'country', 'industry' and 'source' columns from the source DataFrame\n",
-"df.drop(columns=['Country', 'Industry', 'Source', 'Age'], inplace=True)\n",
-"\n",
-"# Concatenate all transformed data into a single DataFrame\n",
-"df_transformed = pd.concat([df, df_country, df_industry, df_source_tfidf, df_age_group], axis=1)\n",
-"\n",
-"# Preview the result\n",
-"print(df_transformed.head())"
"\n",
"\n",
"bins = [0, 30, 40, 50, 60, 70, 80, 101] # age-category bin edges\n",
"labels = ['Under 30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'] # category labels\n",
"\n",
"df[\"Age_category\"] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)\n",
"# Drop the original 'age' column from the source DataFrame\n",
"df.drop(columns=['Age'], inplace=True)\n",
"\n",
"# Preview the result\n",
"print(df.head())"
]
},
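
Note the changed upper bin edge (100 to 101): with right=False, pd.cut builds left-closed, right-open intervals, so [80, 100) would silently turn a 100-year-old into NaN, while [80, 101) keeps them in the '80+' band. A small sketch of that behavior on toy ages:

    import pandas as pd

    bins = [0, 30, 40, 50, 60, 70, 80, 101]
    labels = ['Under 30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+']

    ages = pd.Series([29, 30, 80, 100])  # toy values, not the dataset
    print(pd.cut(ages, bins=bins, labels=labels, right=False))
    # 29 -> Under 30, 30 -> 30-40, 80 -> 80+, 100 -> 80+ (would be NaN with a last edge of 100)
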
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Split the dataset into training and test samples (80/20) for the classification task. Target feature: Age"
]
},
{
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 27,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"ename": "NameError", "data": {
"evalue": "name 'df_transformed' is not defined", "text/plain": [
"output_type": "error", "'X_train'"
"traceback": [ ]
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", },
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", "metadata": {},
"Cell \u001b[1;32mIn[18], line 46\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(df_input) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_train) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_val) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_test)\n\u001b[0;32m 43\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df_train, df_val, df_test, y_train, y_val, y_test\n\u001b[0;32m 45\u001b[0m X_train, X_val, X_test, y_train, y_val, y_test \u001b[38;5;241m=\u001b[39m split_stratified_into_train_val_test(\n\u001b[1;32m---> 46\u001b[0m \u001b[43mdf_transformed\u001b[49m, stratify_colname\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAge\u001b[39m\u001b[38;5;124m\"\u001b[39m, frac_train\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.80\u001b[39m, frac_val\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m, frac_test\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.20\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 47\u001b[0m )\n\u001b[0;32m 49\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mX_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, X_train)\n\u001b[0;32m 50\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124my_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, y_train)\n", "output_type": "display_data"
"\u001b[1;31mNameError\u001b[0m: name 'df_transformed' is not defined" },
] {
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Rank</th>\n",
" <th>Name</th>\n",
" <th>Networth</th>\n",
" <th>Country</th>\n",
" <th>Source</th>\n",
" <th>Industry</th>\n",
" <th>Age_category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1909</th>\n",
" <td>1818</td>\n",
" <td>Tran Ba Duong &amp; family</td>\n",
" <td>1.6</td>\n",
" <td>Vietnam</td>\n",
" <td>automotive</td>\n",
" <td>Automotive</td>\n",
" <td>60-70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2099</th>\n",
" <td>2076</td>\n",
" <td>Mark Dixon</td>\n",
" <td>1.4</td>\n",
" <td>United Kingdom</td>\n",
" <td>office real estate</td>\n",
" <td>Real Estate</td>\n",
" <td>60-70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1392</th>\n",
" <td>1341</td>\n",
" <td>Yingzhuo Xu</td>\n",
" <td>2.3</td>\n",
" <td>China</td>\n",
" <td>agribusiness</td>\n",
" <td>Food &amp; Beverage</td>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>627</th>\n",
" <td>622</td>\n",
" <td>Bruce Flatt</td>\n",
" <td>4.6</td>\n",
" <td>Canada</td>\n",
" <td>money management</td>\n",
" <td>Finance &amp; Investments</td>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>527</th>\n",
" <td>523</td>\n",
" <td>Li Liangbin</td>\n",
" <td>5.2</td>\n",
" <td>China</td>\n",
" <td>lithium</td>\n",
" <td>Manufacturing</td>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>84</th>\n",
" <td>85</td>\n",
" <td>Theo Albrecht, Jr. &amp; family</td>\n",
" <td>18.7</td>\n",
" <td>Germany</td>\n",
" <td>Aldi, Trader Joe's</td>\n",
" <td>Fashion &amp; Retail</td>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>633</th>\n",
" <td>622</td>\n",
" <td>Tony Tamer</td>\n",
" <td>4.6</td>\n",
" <td>United States</td>\n",
" <td>private equity</td>\n",
" <td>Finance &amp; Investments</td>\n",
" <td>60-70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>922</th>\n",
" <td>913</td>\n",
" <td>Bob Gaglardi</td>\n",
" <td>3.3</td>\n",
" <td>Canada</td>\n",
" <td>hotels</td>\n",
" <td>Real Estate</td>\n",
" <td>80+</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2178</th>\n",
" <td>2076</td>\n",
" <td>Eugene Wu</td>\n",
" <td>1.4</td>\n",
" <td>Taiwan</td>\n",
" <td>finance</td>\n",
" <td>Finance &amp; Investments</td>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>415</th>\n",
" <td>411</td>\n",
" <td>Leonard Stern</td>\n",
" <td>6.2</td>\n",
" <td>United States</td>\n",
" <td>real estate</td>\n",
" <td>Real Estate</td>\n",
" <td>80+</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2080 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" Rank Name Networth Country \\\n",
"1909 1818 Tran Ba Duong & family 1.6 Vietnam \n",
"2099 2076 Mark Dixon 1.4 United Kingdom \n",
"1392 1341 Yingzhuo Xu 2.3 China \n",
"627 622 Bruce Flatt 4.6 Canada \n",
"527 523 Li Liangbin 5.2 China \n",
"... ... ... ... ... \n",
"84 85 Theo Albrecht, Jr. & family 18.7 Germany \n",
"633 622 Tony Tamer 4.6 United States \n",
"922 913 Bob Gaglardi 3.3 Canada \n",
"2178 2076 Eugene Wu 1.4 Taiwan \n",
"415 411 Leonard Stern 6.2 United States \n",
"\n",
" Source Industry Age_category \n",
"1909 automotive Automotive 60-70 \n",
"2099 office real estate Real Estate 60-70 \n",
"1392 agribusiness Food & Beverage 50-60 \n",
"627 money management Finance & Investments 50-60 \n",
"527 lithium Manufacturing 50-60 \n",
"... ... ... ... \n",
"84 Aldi, Trader Joe's Fashion & Retail 70-80 \n",
"633 private equity Finance & Investments 60-70 \n",
"922 hotels Real Estate 80+ \n",
"2178 finance Finance & Investments 70-80 \n",
"415 real estate Real Estate 80+ \n",
"\n",
"[2080 rows x 7 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'y_train'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age_category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1909</th>\n",
" <td>60-70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2099</th>\n",
" <td>60-70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1392</th>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>627</th>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>527</th>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>84</th>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>633</th>\n",
" <td>60-70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>922</th>\n",
" <td>80+</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2178</th>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>415</th>\n",
" <td>80+</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2080 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" Age_category\n",
"1909 60-70\n",
"2099 60-70\n",
"1392 50-60\n",
"627 50-60\n",
"527 50-60\n",
"... ...\n",
"84 70-80\n",
"633 60-70\n",
"922 80+\n",
"2178 70-80\n",
"415 80+\n",
"\n",
"[2080 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'X_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Rank</th>\n",
" <th>Name</th>\n",
" <th>Networth</th>\n",
" <th>Country</th>\n",
" <th>Source</th>\n",
" <th>Industry</th>\n",
" <th>Age_category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2075</th>\n",
" <td>2076</td>\n",
" <td>Radhe Shyam Agarwal</td>\n",
" <td>1.4</td>\n",
" <td>India</td>\n",
" <td>consumer goods</td>\n",
" <td>Fashion &amp; Retail</td>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1529</th>\n",
" <td>1513</td>\n",
" <td>Robert Duggan</td>\n",
" <td>2.0</td>\n",
" <td>United States</td>\n",
" <td>pharmaceuticals</td>\n",
" <td>Healthcare</td>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1803</th>\n",
" <td>1729</td>\n",
" <td>Yao Kuizhang</td>\n",
" <td>1.7</td>\n",
" <td>China</td>\n",
" <td>beverages</td>\n",
" <td>Food &amp; Beverage</td>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>425</th>\n",
" <td>424</td>\n",
" <td>Alexei Kuzmichev</td>\n",
" <td>6.0</td>\n",
" <td>Russia</td>\n",
" <td>oil, banking, telecom</td>\n",
" <td>Energy</td>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2597</th>\n",
" <td>2578</td>\n",
" <td>Ramesh Genomal</td>\n",
" <td>1.0</td>\n",
" <td>Philippines</td>\n",
" <td>apparel</td>\n",
" <td>Fashion &amp; Retail</td>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>935</th>\n",
" <td>913</td>\n",
" <td>Alfred Oetker</td>\n",
" <td>3.3</td>\n",
" <td>Germany</td>\n",
" <td>consumer goods</td>\n",
" <td>Fashion &amp; Retail</td>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1541</th>\n",
" <td>1513</td>\n",
" <td>Thomas Lee</td>\n",
" <td>2.0</td>\n",
" <td>United States</td>\n",
" <td>private equity</td>\n",
" <td>Finance &amp; Investments</td>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1646</th>\n",
" <td>1645</td>\n",
" <td>Roberto Angelini Rossi</td>\n",
" <td>1.8</td>\n",
" <td>Chile</td>\n",
" <td>forestry, mining</td>\n",
" <td>diversified</td>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>376</th>\n",
" <td>375</td>\n",
" <td>Patrick Drahi</td>\n",
" <td>6.6</td>\n",
" <td>France</td>\n",
" <td>telecom</td>\n",
" <td>Telecom</td>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1894</th>\n",
" <td>1818</td>\n",
" <td>Gerald Schwartz</td>\n",
" <td>1.6</td>\n",
" <td>Canada</td>\n",
" <td>finance</td>\n",
" <td>Finance &amp; Investments</td>\n",
" <td>80+</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>520 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" Rank Name Networth Country \\\n",
"2075 2076 Radhe Shyam Agarwal 1.4 India \n",
"1529 1513 Robert Duggan 2.0 United States \n",
"1803 1729 Yao Kuizhang 1.7 China \n",
"425 424 Alexei Kuzmichev 6.0 Russia \n",
"2597 2578 Ramesh Genomal 1.0 Philippines \n",
"... ... ... ... ... \n",
"935 913 Alfred Oetker 3.3 Germany \n",
"1541 1513 Thomas Lee 2.0 United States \n",
"1646 1645 Roberto Angelini Rossi 1.8 Chile \n",
"376 375 Patrick Drahi 6.6 France \n",
"1894 1818 Gerald Schwartz 1.6 Canada \n",
"\n",
" Source Industry Age_category \n",
"2075 consumer goods Fashion & Retail 70-80 \n",
"1529 pharmaceuticals Healthcare 70-80 \n",
"1803 beverages Food & Beverage 50-60 \n",
"425 oil, banking, telecom Energy 50-60 \n",
"2597 apparel Fashion & Retail 70-80 \n",
"... ... ... ... \n",
"935 consumer goods Fashion & Retail 50-60 \n",
"1541 private equity Finance & Investments 70-80 \n",
"1646 forestry, mining diversified 70-80 \n",
"376 telecom Telecom 50-60 \n",
"1894 finance Finance & Investments 80+ \n",
"\n",
"[520 rows x 7 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'y_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age_category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2075</th>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1529</th>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1803</th>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>425</th>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2597</th>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>935</th>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1541</th>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1646</th>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>376</th>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1894</th>\n",
" <td>80+</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>520 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" Age_category\n",
"2075 70-80\n",
"1529 70-80\n",
"1803 50-60\n",
"425 50-60\n",
"2597 70-80\n",
"... ...\n",
"935 50-60\n",
"1541 70-80\n",
"1646 70-80\n",
"376 50-60\n",
"1894 80+\n",
"\n",
"[520 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
-"from typing import Tuple\n",
-"import pandas as pd\n",
-"from pandas import DataFrame\n",
-"from sklearn.model_selection import train_test_split\n",
-"\n",
-"def split_stratified_into_train_val_test(\n",
-"    df_input,\n",
-"    stratify_colname=\"y\",\n",
-"    frac_train=0.6,\n",
-"    frac_val=0.15,\n",
-"    frac_test=0.25,\n",
-"    random_state=None,\n",
-") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
-"\n",
-"    if frac_train + frac_val + frac_test != 1.0:\n",
-"        raise ValueError(\n",
-"            \"fractions %f, %f, %f do not add up to 1.0\"\n",
-"            % (frac_train, frac_val, frac_test)\n",
-"        )\n",
-"    if stratify_colname not in df_input.columns:\n",
-"        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
-"    X = df_input  # Contains all columns.\n",
-"    y = df_input[\n",
-"        [stratify_colname]\n",
-"    ]  # Dataframe of just the column on which to stratify.\n",
-"    # Split original dataframe into train and temp dataframes.\n",
-"    df_train, df_temp, y_train, y_temp = train_test_split(\n",
-"        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
-"    )\n",
-"    if frac_val <= 0:\n",
-"        assert len(df_input) == len(df_train) + len(df_temp)\n",
-"        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
-"    # Split the temp dataframe into val and test dataframes.\n",
-"    relative_frac_test = frac_test / (frac_val + frac_test)\n",
-"    df_val, df_test, y_val, y_test = train_test_split(\n",
-"        df_temp,\n",
-"        y_temp,\n",
-"        stratify=y_temp,\n",
-"        test_size=relative_frac_test,\n",
-"        random_state=random_state,\n",
-"    )\n",
-"    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
-"    return df_train, df_val, df_test, y_train, y_val, y_test\n",
"from utils import split_stratified_into_train_val_test\n",
"\n",
"X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
-"    df, stratify_colname=\"Age\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=None\n",
"    df, stratify_colname=\"Age_category\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=9\n",
")\n",
"\n",
"display(\"X_train\", X_train)\n",
@@ -214,6 +723,135 @@
"display(\"X_test\", X_test)\n",
"display(\"y_test\", y_test)"
]
},
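
The helper now lives in a local utils module; judging by the deleted lines above, utils.split_stratified_into_train_val_test presumably wraps sklearn's train_test_split with stratification on the chosen column. A self-contained sketch of what stratification buys here, on toy data with made-up proportions:

    import pandas as pd
    from sklearn.model_selection import train_test_split

    # Toy frame: 'Age_category' plays the role of the stratification target
    df = pd.DataFrame({
        "Networth": range(20),
        "Age_category": ["50-60"] * 10 + ["70-80"] * 10,
    })

    train, test = train_test_split(
        df, test_size=0.20, stratify=df["Age_category"], random_state=9
    )

    # Class shares survive the 80/20 split in both parts
    print(train["Age_category"].value_counts(normalize=True))
    print(test["Age_category"].value_counts(normalize=True))
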
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Формирование конвейера для классификации данных\n",
"## preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n",
"## preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование\n",
"## features_preprocessing -- трансформер для предобработки признаков\n",
"## features_engineering -- трансформер для конструирования признаков\n",
"## drop_columns -- трансформер для удаления колонок\n",
"## pipeline_end -- основной конвейер предобработки данных и конструирования признаков"
]
},
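
Two parameters recur in the cell below: remainder="passthrough" forwards columns that no transformer claims, and verbose_feature_names_out=False keeps output column names unprefixed. A tiny sketch of both, on toy columns rather than the notebook's data:

    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler

    df = pd.DataFrame({"Networth": [1.0, 2.0], "Country": ["US", "FR"]})

    ct = ColumnTransformer(
        transformers=[("num", StandardScaler(), ["Networth"])],
        remainder="passthrough",          # 'Country' passes through untouched
        verbose_feature_names_out=False,  # 'Networth' instead of 'num__Networth'
    )
    ct.fit(df)
    print(ct.get_feature_names_out())     # ['Networth' 'Country']
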
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.discriminant_analysis import StandardScaler\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"\n",
"columns_to_drop = [\"Rank \", \"Name\"]\n",
"num_columns = [\n",
" column\n",
" for column in df.columns\n",
" if column not in columns_to_drop and df[column].dtype != \"object\"\n",
"]\n",
"cat_columns = [\n",
" column\n",
" for column in df.columns\n",
" if column not in columns_to_drop and df[column].dtype == \"object\"\n",
"]\n",
"\n",
"num_imputer = SimpleImputer(strategy=\"median\")\n",
"num_scaler = StandardScaler()\n",
"preprocessing_num = Pipeline(\n",
" [\n",
" (\"imputer\", num_imputer),\n",
" (\"scaler\", num_scaler),\n",
" ]\n",
")\n",
"\n",
"cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n",
"cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n",
"preprocessing_cat = Pipeline(\n",
" [\n",
" (\"imputer\", cat_imputer),\n",
" (\"encoder\", cat_encoder),\n",
" ]\n",
")\n",
"\n",
"features_preprocessing = ColumnTransformer(\n",
" verbose_feature_names_out=False,\n",
" transformers=[\n",
" (\"prepocessing_num\", preprocessing_num, num_columns),\n",
" (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n",
" (\"prepocessing_features\", cat_imputer),\n",
" ],\n",
" remainder=\"passthrough\"\n",
")\n",
"\n",
"\n",
"drop_columns = ColumnTransformer(\n",
" verbose_feature_names_out=False,\n",
" transformers=[\n",
" (\"drop_columns\", \"drop\", columns_to_drop),\n",
" ],\n",
" remainder=\"passthrough\",\n",
")\n",
"\n",
"features_postprocessing = ColumnTransformer(\n",
" verbose_feature_names_out=False,\n",
" transformers=[\n",
" (\"prepocessing_cat\", preprocessing_cat, [\"Cabin_type\"]),\n",
" ],\n",
" remainder=\"passthrough\",\n",
")\n",
"\n",
"pipeline_end = Pipeline(\n",
" [\n",
" (\"features_preprocessing\", features_preprocessing),\n",
" (\"drop_columns\", drop_columns),\n",
" (\"features_postprocessing\", features_postprocessing),\n",
" ]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "not enough values to unpack (expected 3, got 2)",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[31], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m preprocessing_result \u001b[38;5;241m=\u001b[39m \u001b[43mpipeline_end\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m preprocessed_df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(\n\u001b[0;32m 3\u001b[0m preprocessing_result,\n\u001b[0;32m 4\u001b[0m columns\u001b[38;5;241m=\u001b[39mpipeline_end\u001b[38;5;241m.\u001b[39mget_feature_names_out(),\n\u001b[0;32m 5\u001b[0m )\n\u001b[0;32m 7\u001b[0m preprocessed_df\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py:533\u001b[0m, in \u001b[0;36mPipeline.fit_transform\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 490\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Fit the model and transform with the final estimator.\u001b[39;00m\n\u001b[0;32m 491\u001b[0m \n\u001b[0;32m 492\u001b[0m \u001b[38;5;124;03mFit all the transformers one after the other and sequentially transform\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 530\u001b[0m \u001b[38;5;124;03m Transformed samples.\u001b[39;00m\n\u001b[0;32m 531\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 532\u001b[0m routed_params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_method_params(method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m, props\u001b[38;5;241m=\u001b[39mparams)\n\u001b[1;32m--> 533\u001b[0m Xt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrouted_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 535\u001b[0m last_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_final_estimator\n\u001b[0;32m 536\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_log_message(\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m)):\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py:406\u001b[0m, in \u001b[0;36mPipeline._fit\u001b[1;34m(self, X, y, routed_params)\u001b[0m\n\u001b[0;32m 404\u001b[0m cloned_transformer \u001b[38;5;241m=\u001b[39m clone(transformer)\n\u001b[0;32m 405\u001b[0m \u001b[38;5;66;03m# Fit or load from cache the current transformer\u001b[39;00m\n\u001b[1;32m--> 406\u001b[0m X, fitted_transformer \u001b[38;5;241m=\u001b[39m \u001b[43mfit_transform_one_cached\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 407\u001b[0m \u001b[43m \u001b[49m\u001b[43mcloned_transformer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 408\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 409\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 410\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 411\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage_clsname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPipeline\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 412\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_message\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstep_idx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 413\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrouted_params\u001b[49m\u001b[43m[\u001b[49m\u001b[43mname\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 414\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 415\u001b[0m \u001b[38;5;66;03m# Replace the transformer of the step with the fitted\u001b[39;00m\n\u001b[0;32m 416\u001b[0m \u001b[38;5;66;03m# transformer. This is necessary when loading the transformer\u001b[39;00m\n\u001b[0;32m 417\u001b[0m \u001b[38;5;66;03m# from the cache.\u001b[39;00m\n\u001b[0;32m 418\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps[step_idx] \u001b[38;5;241m=\u001b[39m (name, fitted_transformer)\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\joblib\\memory.py:312\u001b[0m, in \u001b[0;36mNotMemorizedFunc.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 311\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 312\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py:1310\u001b[0m, in \u001b[0;36m_fit_transform_one\u001b[1;34m(transformer, X, y, weight, message_clsname, message, params)\u001b[0m\n\u001b[0;32m 1308\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(message_clsname, message):\n\u001b[0;32m 1309\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(transformer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m-> 1310\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mtransformer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfit_transform\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1311\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1312\u001b[0m res \u001b[38;5;241m=\u001b[39m transformer\u001b[38;5;241m.\u001b[39mfit(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit\u001b[39m\u001b[38;5;124m\"\u001b[39m, {}))\u001b[38;5;241m.\u001b[39mtransform(\n\u001b[0;32m 1313\u001b[0m X, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtransform\u001b[39m\u001b[38;5;124m\"\u001b[39m, {})\n\u001b[0;32m 1314\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\utils\\_set_output.py:316\u001b[0m, in \u001b[0;36m_wrap_method_output.<locals>.wrapped\u001b[1;34m(self, X, *args, **kwargs)\u001b[0m\n\u001b[0;32m 314\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(f)\n\u001b[0;32m 315\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapped\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 316\u001b[0m data_to_wrap \u001b[38;5;241m=\u001b[39m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_to_wrap, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[0;32m 318\u001b[0m \u001b[38;5;66;03m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[0;32m 319\u001b[0m return_tuple \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 320\u001b[0m _wrap_data_with_container(method, data_to_wrap[\u001b[38;5;241m0\u001b[39m], X, \u001b[38;5;28mself\u001b[39m),\n\u001b[0;32m 321\u001b[0m \u001b[38;5;241m*\u001b[39mdata_to_wrap[\u001b[38;5;241m1\u001b[39m:],\n\u001b[0;32m 322\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:965\u001b[0m, in \u001b[0;36mColumnTransformer.fit_transform\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 963\u001b[0m \u001b[38;5;66;03m# set n_features_in_ attribute\u001b[39;00m\n\u001b[0;32m 964\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_n_features(X, reset\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m--> 965\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_transformers\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 966\u001b[0m n_samples \u001b[38;5;241m=\u001b[39m _num_samples(X)\n\u001b[0;32m 968\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_column_callables(X)\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:501\u001b[0m, in \u001b[0;36mColumnTransformer._validate_transformers\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransformers:\n\u001b[0;32m 499\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m--> 501\u001b[0m names, transformers, _ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mzip\u001b[39m(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransformers)\n\u001b[0;32m 503\u001b[0m \u001b[38;5;66;03m# validate names\u001b[39;00m\n\u001b[0;32m 504\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_names(names)\n",
"\u001b[1;31mValueError\u001b[0m: not enough values to unpack (expected 3, got 2)"
]
}
],
"source": [
"preprocessing_result = pipeline_end.fit_transform(X_train)\n",
"preprocessed_df = pd.DataFrame(\n",
" preprocessing_result,\n",
" columns=pipeline_end.get_feature_names_out(),\n",
")\n",
"\n",
"preprocessed_df"
]
} }
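
The ValueError comes from features_preprocessing: ColumnTransformer expects (name, transformer, columns) triples, and the entry ("prepocessing_features", cat_imputer) has only two elements, so zip(*self.transformers) cannot unpack three values. The features_postprocessing step would fail next anyway, since this dataset has no Cabin_type column (it looks carried over from a Titanic example). A possible repair, sketched under the assumption that both can simply be dropped; it reuses columns_to_drop, num_columns and cat_columns plus the two sub-pipelines from the cell above, and imports StandardScaler from its canonical home in sklearn.preprocessing. Further adjustments may still be needed (the category-dtyped Age_category column lands in num_columns, and the trailing space in "Rank " must match the actual CSV header):

    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline

    # Every transformer entry must be a (name, transformer, columns) triple
    features_preprocessing = ColumnTransformer(
        verbose_feature_names_out=False,
        transformers=[
            ("preprocessing_num", preprocessing_num, num_columns),
            ("preprocessing_cat", preprocessing_cat, cat_columns),
        ],
        remainder="passthrough",
    )

    drop_columns = ColumnTransformer(
        verbose_feature_names_out=False,
        transformers=[("drop_columns", "drop", columns_to_drop)],
        remainder="passthrough",
    )

    pipeline_end = Pipeline([
        ("features_preprocessing", features_preprocessing),
        ("drop_columns", drop_columns),
    ])

    # With the malformed two-element entry gone, fit_transform no longer
    # raises the "not enough values to unpack" error
    preprocessing_result = pipeline_end.fit_transform(X_train)
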
],
"metadata": {