diff --git a/lab_4/lab4.ipynb b/lab_4/lab4.ipynb
index f016a48..ba7919c 100644
--- a/lab_4/lab4.ipynb
+++ b/lab_4/lab4.ipynb
@@ -33,7 +33,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Подготовим данные: удалим колонки rank и name(в них уникальные значения, которые не участвуют в предсказаниях). А также преобразуем номинальные колонки в числовые(country, source, industry) и категоризируем колонку age"
+ "# Подготовим данные: категоризируем колонку age"
]
},
{
@@ -45,167 +45,676 @@
"name": "stdout",
"output_type": "stream",
"text": [
- " Networth Age Country Source Industry\n",
- "0 219.0 50 United States Tesla, SpaceX Automotive \n",
- "1 171.0 58 United States Amazon Technology \n",
- "2 158.0 73 France LVMH Fashion & Retail \n",
- "3 129.0 66 United States Microsoft Technology \n",
- "4 118.0 91 United States Berkshire Hathaway Finance & Investments \n"
+ "Rank 0\n",
+ "Name 0\n",
+ "Networth 0\n",
+ "Age 0\n",
+ "Country 0\n",
+ "Source 0\n",
+ "Industry 0\n",
+ "dtype: int64\n",
+ "\n",
+ "Rank False\n",
+ "Name False\n",
+ "Networth False\n",
+ "Age False\n",
+ "Country False\n",
+ "Source False\n",
+ "Industry False\n",
+ "dtype: bool\n",
+ "\n"
]
}
],
"source": [
- "# Удаление колонок 'rank' и 'name'\n",
- "df.drop(columns=['Rank ', 'Name'], inplace=True)\n",
+ "print(df.isnull().sum())\n",
"\n",
- "# Проверка, что колонки были удалены\n",
- "print(df.head())"
+ "print()\n",
+ "\n",
+ "# Есть ли пустые значения признаков\n",
+ "print(df.isnull().any())\n",
+ "\n",
+ "print()\n",
+ "\n",
+ "# Процент пустых значений признаков\n",
+ "for i in df.columns:\n",
+ " null_rate = df[i].isnull().sum() / len(df) * 100\n",
+ " if null_rate > 0:\n",
+ " print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
]
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- " Networth Country_Argentina Country_Australia Country_Austria \\\n",
- "0 219.0 False False False \n",
- "1 171.0 False False False \n",
- "2 158.0 False False False \n",
- "3 129.0 False False False \n",
- "4 118.0 False False False \n",
+ " Rank Name Networth Country \\\n",
+ "0 1 Elon Musk 219.0 United States \n",
+ "1 2 Jeff Bezos 171.0 United States \n",
+ "2 3 Bernard Arnault & family 158.0 France \n",
+ "3 4 Bill Gates 129.0 United States \n",
+ "4 5 Warren Buffett 118.0 United States \n",
"\n",
- " Country_Barbados Country_Belgium Country_Belize Country_Brazil \\\n",
- "0 False False False False \n",
- "1 False False False False \n",
- "2 False False False False \n",
- "3 False False False False \n",
- "4 False False False False \n",
- "\n",
- " Country_Bulgaria Country_Canada ... wind wine winter wire wireless \\\n",
- "0 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
- "1 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
- "2 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
- "3 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
- "4 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
- "\n",
- " yahoo yogurt zara zoom Age \n",
- "0 0.0 0.0 0.0 0.0 50-60 \n",
- "1 0.0 0.0 0.0 0.0 50-60 \n",
- "2 0.0 0.0 0.0 0.0 70-80 \n",
- "3 0.0 0.0 0.0 0.0 60-70 \n",
- "4 0.0 0.0 0.0 0.0 80+ \n",
- "\n",
- "[5 rows x 828 columns]\n"
+ " Source Industry Age_category \n",
+ "0 Tesla, SpaceX Automotive 50-60 \n",
+ "1 Amazon Technology 50-60 \n",
+ "2 LVMH Fashion & Retail 70-80 \n",
+ "3 Microsoft Technology 60-70 \n",
+ "4 Berkshire Hathaway Finance & Investments 80+ \n"
]
}
],
"source": [
- "from sklearn.preprocessing import OneHotEncoder\n",
- "from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
- "# Преобразуем 'country' и 'industry' в бинарные матрицы с помощью One-Hot Encoding\n",
- "df_country = pd.get_dummies(df[['Country']], drop_first=True)\n",
- "df_industry = pd.get_dummies(df[['Industry']], drop_first=True)\n",
"\n",
- "# Преобразуем колонку 'source' с помощью TF-IDF\n",
- "tfidf_vectorizer = TfidfVectorizer(max_features=1000) \n",
- "X_tfidf = tfidf_vectorizer.fit_transform(df['Source']).toarray()\n",
- "\n",
- "# Создаем DataFrame с результатами TF-IDF\n",
- "df_source_tfidf = pd.DataFrame(X_tfidf, columns=tfidf_vectorizer.get_feature_names_out())\n",
- "\n",
- "bins = [0, 30, 40, 50, 60, 70, 80, 100] # границы для возрастных категорий\n",
+ "bins = [0, 30, 40, 50, 60, 70, 80, 101] # границы для возрастных категорий\n",
"labels = ['Under 30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'] # метки для категорий\n",
"\n",
- "# Создаем новую колонку 'age_group', где будет храниться категория\n",
- "df_age_group = pd.cut(df['Age'], bins=bins, labels=labels, right=False)\n",
- "\n",
- "\n",
+ "df[\"Age_category\"] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)\n",
"# Удаляем оригинальные колонки 'country', 'industry' и 'source' из исходного DataFrame\n",
- "df.drop(columns=['Country', 'Industry', 'Source', 'Age'], inplace=True)\n",
- "\n",
- "# Объединяем все преобразованные данные в один DataFrame\n",
- "df_transformed = pd.concat([df, df_country, df_industry, df_source_tfidf, df_age_group], axis=1)\n",
+ "df.drop(columns=['Age'], inplace=True)\n",
"\n",
"# Просмотр результата\n",
- "print(df_transformed.head())"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Разобьём набор данных на обучающую и тестовые выборки (80/20) для задачи классификации. Целевой признак- Age"
+ "print(df.head())"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 27,
"metadata": {},
"outputs": [
{
- "ename": "NameError",
- "evalue": "name 'df_transformed' is not defined",
- "output_type": "error",
- "traceback": [
- "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
- "Cell \u001b[1;32mIn[18], line 46\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(df_input) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_train) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_val) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_test)\n\u001b[0;32m 43\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df_train, df_val, df_test, y_train, y_val, y_test\n\u001b[0;32m 45\u001b[0m X_train, X_val, X_test, y_train, y_val, y_test \u001b[38;5;241m=\u001b[39m split_stratified_into_train_val_test(\n\u001b[1;32m---> 46\u001b[0m \u001b[43mdf_transformed\u001b[49m, stratify_colname\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAge\u001b[39m\u001b[38;5;124m\"\u001b[39m, frac_train\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.80\u001b[39m, frac_val\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m, frac_test\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.20\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 47\u001b[0m )\n\u001b[0;32m 49\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mX_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, X_train)\n\u001b[0;32m 50\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124my_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, y_train)\n",
- "\u001b[1;31mNameError\u001b[0m: name 'df_transformed' is not defined"
- ]
+ "data": {
+ "text/plain": [
+ "'X_train'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Rank | \n",
+ " Name | \n",
+ " Networth | \n",
+ " Country | \n",
+ " Source | \n",
+ " Industry | \n",
+ " Age_category | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1909 | \n",
+ " 1818 | \n",
+ " Tran Ba Duong & family | \n",
+ " 1.6 | \n",
+ " Vietnam | \n",
+ " automotive | \n",
+ " Automotive | \n",
+ " 60-70 | \n",
+ "
\n",
+ " \n",
+ " 2099 | \n",
+ " 2076 | \n",
+ " Mark Dixon | \n",
+ " 1.4 | \n",
+ " United Kingdom | \n",
+ " office real estate | \n",
+ " Real Estate | \n",
+ " 60-70 | \n",
+ "
\n",
+ " \n",
+ " 1392 | \n",
+ " 1341 | \n",
+ " Yingzhuo Xu | \n",
+ " 2.3 | \n",
+ " China | \n",
+ " agribusiness | \n",
+ " Food & Beverage | \n",
+ " 50-60 | \n",
+ "
\n",
+ " \n",
+ " 627 | \n",
+ " 622 | \n",
+ " Bruce Flatt | \n",
+ " 4.6 | \n",
+ " Canada | \n",
+ " money management | \n",
+ " Finance & Investments | \n",
+ " 50-60 | \n",
+ "
\n",
+ " \n",
+ " 527 | \n",
+ " 523 | \n",
+ " Li Liangbin | \n",
+ " 5.2 | \n",
+ " China | \n",
+ " lithium | \n",
+ " Manufacturing | \n",
+ " 50-60 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 84 | \n",
+ " 85 | \n",
+ " Theo Albrecht, Jr. & family | \n",
+ " 18.7 | \n",
+ " Germany | \n",
+ " Aldi, Trader Joe's | \n",
+ " Fashion & Retail | \n",
+ " 70-80 | \n",
+ "
\n",
+ " \n",
+ " 633 | \n",
+ " 622 | \n",
+ " Tony Tamer | \n",
+ " 4.6 | \n",
+ " United States | \n",
+ " private equity | \n",
+ " Finance & Investments | \n",
+ " 60-70 | \n",
+ "
\n",
+ " \n",
+ " 922 | \n",
+ " 913 | \n",
+ " Bob Gaglardi | \n",
+ " 3.3 | \n",
+ " Canada | \n",
+ " hotels | \n",
+ " Real Estate | \n",
+ " 80+ | \n",
+ "
\n",
+ " \n",
+ " 2178 | \n",
+ " 2076 | \n",
+ " Eugene Wu | \n",
+ " 1.4 | \n",
+ " Taiwan | \n",
+ " finance | \n",
+ " Finance & Investments | \n",
+ " 70-80 | \n",
+ "
\n",
+ " \n",
+ " 415 | \n",
+ " 411 | \n",
+ " Leonard Stern | \n",
+ " 6.2 | \n",
+ " United States | \n",
+ " real estate | \n",
+ " Real Estate | \n",
+ " 80+ | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2080 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Rank Name Networth Country \\\n",
+ "1909 1818 Tran Ba Duong & family 1.6 Vietnam \n",
+ "2099 2076 Mark Dixon 1.4 United Kingdom \n",
+ "1392 1341 Yingzhuo Xu 2.3 China \n",
+ "627 622 Bruce Flatt 4.6 Canada \n",
+ "527 523 Li Liangbin 5.2 China \n",
+ "... ... ... ... ... \n",
+ "84 85 Theo Albrecht, Jr. & family 18.7 Germany \n",
+ "633 622 Tony Tamer 4.6 United States \n",
+ "922 913 Bob Gaglardi 3.3 Canada \n",
+ "2178 2076 Eugene Wu 1.4 Taiwan \n",
+ "415 411 Leonard Stern 6.2 United States \n",
+ "\n",
+ " Source Industry Age_category \n",
+ "1909 automotive Automotive 60-70 \n",
+ "2099 office real estate Real Estate 60-70 \n",
+ "1392 agribusiness Food & Beverage 50-60 \n",
+ "627 money management Finance & Investments 50-60 \n",
+ "527 lithium Manufacturing 50-60 \n",
+ "... ... ... ... \n",
+ "84 Aldi, Trader Joe's Fashion & Retail 70-80 \n",
+ "633 private equity Finance & Investments 60-70 \n",
+ "922 hotels Real Estate 80+ \n",
+ "2178 finance Finance & Investments 70-80 \n",
+ "415 real estate Real Estate 80+ \n",
+ "\n",
+ "[2080 rows x 7 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'y_train'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Age_category | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1909 | \n",
+ " 60-70 | \n",
+ "
\n",
+ " \n",
+ " 2099 | \n",
+ " 60-70 | \n",
+ "
\n",
+ " \n",
+ " 1392 | \n",
+ " 50-60 | \n",
+ "
\n",
+ " \n",
+ " 627 | \n",
+ " 50-60 | \n",
+ "
\n",
+ " \n",
+ " 527 | \n",
+ " 50-60 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 84 | \n",
+ " 70-80 | \n",
+ "
\n",
+ " \n",
+ " 633 | \n",
+ " 60-70 | \n",
+ "
\n",
+ " \n",
+ " 922 | \n",
+ " 80+ | \n",
+ "
\n",
+ " \n",
+ " 2178 | \n",
+ " 70-80 | \n",
+ "
\n",
+ " \n",
+ " 415 | \n",
+ " 80+ | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2080 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Age_category\n",
+ "1909 60-70\n",
+ "2099 60-70\n",
+ "1392 50-60\n",
+ "627 50-60\n",
+ "527 50-60\n",
+ "... ...\n",
+ "84 70-80\n",
+ "633 60-70\n",
+ "922 80+\n",
+ "2178 70-80\n",
+ "415 80+\n",
+ "\n",
+ "[2080 rows x 1 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'X_test'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Rank | \n",
+ " Name | \n",
+ " Networth | \n",
+ " Country | \n",
+ " Source | \n",
+ " Industry | \n",
+ " Age_category | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2075 | \n",
+ " 2076 | \n",
+ " Radhe Shyam Agarwal | \n",
+ " 1.4 | \n",
+ " India | \n",
+ " consumer goods | \n",
+ " Fashion & Retail | \n",
+ " 70-80 | \n",
+ "
\n",
+ " \n",
+ " 1529 | \n",
+ " 1513 | \n",
+ " Robert Duggan | \n",
+ " 2.0 | \n",
+ " United States | \n",
+ " pharmaceuticals | \n",
+ " Healthcare | \n",
+ " 70-80 | \n",
+ "
\n",
+ " \n",
+ " 1803 | \n",
+ " 1729 | \n",
+ " Yao Kuizhang | \n",
+ " 1.7 | \n",
+ " China | \n",
+ " beverages | \n",
+ " Food & Beverage | \n",
+ " 50-60 | \n",
+ "
\n",
+ " \n",
+ " 425 | \n",
+ " 424 | \n",
+ " Alexei Kuzmichev | \n",
+ " 6.0 | \n",
+ " Russia | \n",
+ " oil, banking, telecom | \n",
+ " Energy | \n",
+ " 50-60 | \n",
+ "
\n",
+ " \n",
+ " 2597 | \n",
+ " 2578 | \n",
+ " Ramesh Genomal | \n",
+ " 1.0 | \n",
+ " Philippines | \n",
+ " apparel | \n",
+ " Fashion & Retail | \n",
+ " 70-80 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 935 | \n",
+ " 913 | \n",
+ " Alfred Oetker | \n",
+ " 3.3 | \n",
+ " Germany | \n",
+ " consumer goods | \n",
+ " Fashion & Retail | \n",
+ " 50-60 | \n",
+ "
\n",
+ " \n",
+ " 1541 | \n",
+ " 1513 | \n",
+ " Thomas Lee | \n",
+ " 2.0 | \n",
+ " United States | \n",
+ " private equity | \n",
+ " Finance & Investments | \n",
+ " 70-80 | \n",
+ "
\n",
+ " \n",
+ " 1646 | \n",
+ " 1645 | \n",
+ " Roberto Angelini Rossi | \n",
+ " 1.8 | \n",
+ " Chile | \n",
+ " forestry, mining | \n",
+ " diversified | \n",
+ " 70-80 | \n",
+ "
\n",
+ " \n",
+ " 376 | \n",
+ " 375 | \n",
+ " Patrick Drahi | \n",
+ " 6.6 | \n",
+ " France | \n",
+ " telecom | \n",
+ " Telecom | \n",
+ " 50-60 | \n",
+ "
\n",
+ " \n",
+ " 1894 | \n",
+ " 1818 | \n",
+ " Gerald Schwartz | \n",
+ " 1.6 | \n",
+ " Canada | \n",
+ " finance | \n",
+ " Finance & Investments | \n",
+ " 80+ | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
520 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Rank Name Networth Country \\\n",
+ "2075 2076 Radhe Shyam Agarwal 1.4 India \n",
+ "1529 1513 Robert Duggan 2.0 United States \n",
+ "1803 1729 Yao Kuizhang 1.7 China \n",
+ "425 424 Alexei Kuzmichev 6.0 Russia \n",
+ "2597 2578 Ramesh Genomal 1.0 Philippines \n",
+ "... ... ... ... ... \n",
+ "935 913 Alfred Oetker 3.3 Germany \n",
+ "1541 1513 Thomas Lee 2.0 United States \n",
+ "1646 1645 Roberto Angelini Rossi 1.8 Chile \n",
+ "376 375 Patrick Drahi 6.6 France \n",
+ "1894 1818 Gerald Schwartz 1.6 Canada \n",
+ "\n",
+ " Source Industry Age_category \n",
+ "2075 consumer goods Fashion & Retail 70-80 \n",
+ "1529 pharmaceuticals Healthcare 70-80 \n",
+ "1803 beverages Food & Beverage 50-60 \n",
+ "425 oil, banking, telecom Energy 50-60 \n",
+ "2597 apparel Fashion & Retail 70-80 \n",
+ "... ... ... ... \n",
+ "935 consumer goods Fashion & Retail 50-60 \n",
+ "1541 private equity Finance & Investments 70-80 \n",
+ "1646 forestry, mining diversified 70-80 \n",
+ "376 telecom Telecom 50-60 \n",
+ "1894 finance Finance & Investments 80+ \n",
+ "\n",
+ "[520 rows x 7 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'y_test'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Age_category | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2075 | \n",
+ " 70-80 | \n",
+ "
\n",
+ " \n",
+ " 1529 | \n",
+ " 70-80 | \n",
+ "
\n",
+ " \n",
+ " 1803 | \n",
+ " 50-60 | \n",
+ "
\n",
+ " \n",
+ " 425 | \n",
+ " 50-60 | \n",
+ "
\n",
+ " \n",
+ " 2597 | \n",
+ " 70-80 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 935 | \n",
+ " 50-60 | \n",
+ "
\n",
+ " \n",
+ " 1541 | \n",
+ " 70-80 | \n",
+ "
\n",
+ " \n",
+ " 1646 | \n",
+ " 70-80 | \n",
+ "
\n",
+ " \n",
+ " 376 | \n",
+ " 50-60 | \n",
+ "
\n",
+ " \n",
+ " 1894 | \n",
+ " 80+ | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
520 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Age_category\n",
+ "2075 70-80\n",
+ "1529 70-80\n",
+ "1803 50-60\n",
+ "425 50-60\n",
+ "2597 70-80\n",
+ "... ...\n",
+ "935 50-60\n",
+ "1541 70-80\n",
+ "1646 70-80\n",
+ "376 50-60\n",
+ "1894 80+\n",
+ "\n",
+ "[520 rows x 1 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
}
],
"source": [
- "from typing import Tuple\n",
- "import pandas as pd\n",
- "from pandas import DataFrame\n",
- "from sklearn.model_selection import train_test_split\n",
- "\n",
- "def split_stratified_into_train_val_test(\n",
- " df_input,\n",
- " stratify_colname=\"y\",\n",
- " frac_train=0.6,\n",
- " frac_val=0.15,\n",
- " frac_test=0.25,\n",
- " random_state=None,\n",
- ") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
- " \n",
- " if frac_train + frac_val + frac_test != 1.0:\n",
- " raise ValueError(\n",
- " \"fractions %f, %f, %f do not add up to 1.0\"\n",
- " % (frac_train, frac_val, frac_test)\n",
- " )\n",
- " if stratify_colname not in df_input.columns:\n",
- " raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
- " X = df_input # Contains all columns.\n",
- " y = df_input[\n",
- " [stratify_colname]\n",
- " ] # Dataframe of just the column on which to stratify.\n",
- " # Split original dataframe into train and temp dataframes.\n",
- " df_train, df_temp, y_train, y_temp = train_test_split(\n",
- " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
- " )\n",
- " if frac_val <= 0:\n",
- " assert len(df_input) == len(df_train) + len(df_temp)\n",
- " return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
- " # Split the temp dataframe into val and test dataframes.\n",
- " relative_frac_test = frac_test / (frac_val + frac_test)\n",
- " df_val, df_test, y_val, y_test = train_test_split(\n",
- " df_temp,\n",
- " y_temp,\n",
- " stratify=y_temp,\n",
- " test_size=relative_frac_test,\n",
- " random_state=random_state,\n",
- " )\n",
- " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
- " return df_train, df_val, df_test, y_train, y_val, y_test\n",
+ "from utils import split_stratified_into_train_val_test\n",
"\n",
"X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
- " df, stratify_colname=\"Age\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=None\n",
+ " df, stratify_colname=\"Age_category\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=9\n",
")\n",
"\n",
"display(\"X_train\", X_train)\n",
@@ -214,6 +723,135 @@
"display(\"X_test\", X_test)\n",
"display(\"y_test\", y_test)"
]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Формирование конвейера для классификации данных\n",
+ "## preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n",
+ "## preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование\n",
+ "## features_preprocessing -- трансформер для предобработки признаков\n",
+ "## features_engineering -- трансформер для конструирования признаков\n",
+ "## drop_columns -- трансформер для удаления колонок\n",
+ "## pipeline_end -- основной конвейер предобработки данных и конструирования признаков"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "from sklearn.compose import ColumnTransformer\n",
+ "from sklearn.discriminant_analysis import StandardScaler\n",
+ "from sklearn.impute import SimpleImputer\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.preprocessing import OneHotEncoder\n",
+ "\n",
+ "\n",
+ "columns_to_drop = [\"Rank \", \"Name\"]\n",
+ "num_columns = [\n",
+ " column\n",
+ " for column in df.columns\n",
+ " if column not in columns_to_drop and df[column].dtype != \"object\"\n",
+ "]\n",
+ "cat_columns = [\n",
+ " column\n",
+ " for column in df.columns\n",
+ " if column not in columns_to_drop and df[column].dtype == \"object\"\n",
+ "]\n",
+ "\n",
+ "num_imputer = SimpleImputer(strategy=\"median\")\n",
+ "num_scaler = StandardScaler()\n",
+ "preprocessing_num = Pipeline(\n",
+ " [\n",
+ " (\"imputer\", num_imputer),\n",
+ " (\"scaler\", num_scaler),\n",
+ " ]\n",
+ ")\n",
+ "\n",
+ "cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n",
+ "cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n",
+ "preprocessing_cat = Pipeline(\n",
+ " [\n",
+ " (\"imputer\", cat_imputer),\n",
+ " (\"encoder\", cat_encoder),\n",
+ " ]\n",
+ ")\n",
+ "\n",
+ "features_preprocessing = ColumnTransformer(\n",
+ " verbose_feature_names_out=False,\n",
+ " transformers=[\n",
+ " (\"prepocessing_num\", preprocessing_num, num_columns),\n",
+ " (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n",
+ " (\"prepocessing_features\", cat_imputer),\n",
+ " ],\n",
+ " remainder=\"passthrough\"\n",
+ ")\n",
+ "\n",
+ "\n",
+ "drop_columns = ColumnTransformer(\n",
+ " verbose_feature_names_out=False,\n",
+ " transformers=[\n",
+ " (\"drop_columns\", \"drop\", columns_to_drop),\n",
+ " ],\n",
+ " remainder=\"passthrough\",\n",
+ ")\n",
+ "\n",
+ "features_postprocessing = ColumnTransformer(\n",
+ " verbose_feature_names_out=False,\n",
+ " transformers=[\n",
+ " (\"prepocessing_cat\", preprocessing_cat, [\"Cabin_type\"]),\n",
+ " ],\n",
+ " remainder=\"passthrough\",\n",
+ ")\n",
+ "\n",
+ "pipeline_end = Pipeline(\n",
+ " [\n",
+ " (\"features_preprocessing\", features_preprocessing),\n",
+ " (\"drop_columns\", drop_columns),\n",
+ " (\"features_postprocessing\", features_postprocessing),\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "ValueError",
+ "evalue": "not enough values to unpack (expected 3, got 2)",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[31], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m preprocessing_result \u001b[38;5;241m=\u001b[39m \u001b[43mpipeline_end\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m preprocessed_df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(\n\u001b[0;32m 3\u001b[0m preprocessing_result,\n\u001b[0;32m 4\u001b[0m columns\u001b[38;5;241m=\u001b[39mpipeline_end\u001b[38;5;241m.\u001b[39mget_feature_names_out(),\n\u001b[0;32m 5\u001b[0m )\n\u001b[0;32m 7\u001b[0m preprocessed_df\n",
+ "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context..decorator..wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py:533\u001b[0m, in \u001b[0;36mPipeline.fit_transform\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 490\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Fit the model and transform with the final estimator.\u001b[39;00m\n\u001b[0;32m 491\u001b[0m \n\u001b[0;32m 492\u001b[0m \u001b[38;5;124;03mFit all the transformers one after the other and sequentially transform\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 530\u001b[0m \u001b[38;5;124;03m Transformed samples.\u001b[39;00m\n\u001b[0;32m 531\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 532\u001b[0m routed_params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_method_params(method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m, props\u001b[38;5;241m=\u001b[39mparams)\n\u001b[1;32m--> 533\u001b[0m Xt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrouted_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 535\u001b[0m last_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_final_estimator\n\u001b[0;32m 536\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_log_message(\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m)):\n",
+ "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py:406\u001b[0m, in \u001b[0;36mPipeline._fit\u001b[1;34m(self, X, y, routed_params)\u001b[0m\n\u001b[0;32m 404\u001b[0m cloned_transformer \u001b[38;5;241m=\u001b[39m clone(transformer)\n\u001b[0;32m 405\u001b[0m \u001b[38;5;66;03m# Fit or load from cache the current transformer\u001b[39;00m\n\u001b[1;32m--> 406\u001b[0m X, fitted_transformer \u001b[38;5;241m=\u001b[39m \u001b[43mfit_transform_one_cached\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 407\u001b[0m \u001b[43m \u001b[49m\u001b[43mcloned_transformer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 408\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 409\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 410\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 411\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage_clsname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPipeline\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 412\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_message\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstep_idx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 413\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrouted_params\u001b[49m\u001b[43m[\u001b[49m\u001b[43mname\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 414\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 415\u001b[0m \u001b[38;5;66;03m# Replace the transformer of the step with the fitted\u001b[39;00m\n\u001b[0;32m 416\u001b[0m \u001b[38;5;66;03m# transformer. This is necessary when loading the transformer\u001b[39;00m\n\u001b[0;32m 417\u001b[0m \u001b[38;5;66;03m# from the cache.\u001b[39;00m\n\u001b[0;32m 418\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps[step_idx] \u001b[38;5;241m=\u001b[39m (name, fitted_transformer)\n",
+ "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\joblib\\memory.py:312\u001b[0m, in \u001b[0;36mNotMemorizedFunc.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 311\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 312\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py:1310\u001b[0m, in \u001b[0;36m_fit_transform_one\u001b[1;34m(transformer, X, y, weight, message_clsname, message, params)\u001b[0m\n\u001b[0;32m 1308\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(message_clsname, message):\n\u001b[0;32m 1309\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(transformer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m-> 1310\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mtransformer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfit_transform\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1311\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1312\u001b[0m res \u001b[38;5;241m=\u001b[39m transformer\u001b[38;5;241m.\u001b[39mfit(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit\u001b[39m\u001b[38;5;124m\"\u001b[39m, {}))\u001b[38;5;241m.\u001b[39mtransform(\n\u001b[0;32m 1313\u001b[0m X, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtransform\u001b[39m\u001b[38;5;124m\"\u001b[39m, {})\n\u001b[0;32m 1314\u001b[0m )\n",
+ "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\utils\\_set_output.py:316\u001b[0m, in \u001b[0;36m_wrap_method_output..wrapped\u001b[1;34m(self, X, *args, **kwargs)\u001b[0m\n\u001b[0;32m 314\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(f)\n\u001b[0;32m 315\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapped\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 316\u001b[0m data_to_wrap \u001b[38;5;241m=\u001b[39m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_to_wrap, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[0;32m 318\u001b[0m \u001b[38;5;66;03m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[0;32m 319\u001b[0m return_tuple \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 320\u001b[0m _wrap_data_with_container(method, data_to_wrap[\u001b[38;5;241m0\u001b[39m], X, \u001b[38;5;28mself\u001b[39m),\n\u001b[0;32m 321\u001b[0m \u001b[38;5;241m*\u001b[39mdata_to_wrap[\u001b[38;5;241m1\u001b[39m:],\n\u001b[0;32m 322\u001b[0m )\n",
+ "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context..decorator..wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:965\u001b[0m, in \u001b[0;36mColumnTransformer.fit_transform\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 963\u001b[0m \u001b[38;5;66;03m# set n_features_in_ attribute\u001b[39;00m\n\u001b[0;32m 964\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_n_features(X, reset\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m--> 965\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_transformers\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 966\u001b[0m n_samples \u001b[38;5;241m=\u001b[39m _num_samples(X)\n\u001b[0;32m 968\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_column_callables(X)\n",
+ "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:501\u001b[0m, in \u001b[0;36mColumnTransformer._validate_transformers\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransformers:\n\u001b[0;32m 499\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m--> 501\u001b[0m names, transformers, _ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mzip\u001b[39m(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransformers)\n\u001b[0;32m 503\u001b[0m \u001b[38;5;66;03m# validate names\u001b[39;00m\n\u001b[0;32m 504\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_names(names)\n",
+ "\u001b[1;31mValueError\u001b[0m: not enough values to unpack (expected 3, got 2)"
+ ]
+ }
+ ],
+ "source": [
+ "preprocessing_result = pipeline_end.fit_transform(X_train)\n",
+ "preprocessed_df = pd.DataFrame(\n",
+ " preprocessing_result,\n",
+ " columns=pipeline_end.get_feature_names_out(),\n",
+ ")\n",
+ "\n",
+ "preprocessed_df"
+ ]
}
],
"metadata": {