diff --git a/lab_4/lab4.ipynb b/lab_4/lab4.ipynb index f016a48..ba7919c 100644 --- a/lab_4/lab4.ipynb +++ b/lab_4/lab4.ipynb @@ -33,7 +33,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Подготовим данные: удалим колонки rank и name(в них уникальные значения, которые не участвуют в предсказаниях). А также преобразуем номинальные колонки в числовые(country, source, industry) и категоризируем колонку age" + "# Подготовим данные: категоризируем колонку age" ] }, { @@ -45,167 +45,676 @@ "name": "stdout", "output_type": "stream", "text": [ - " Networth Age Country Source Industry\n", - "0 219.0 50 United States Tesla, SpaceX Automotive \n", - "1 171.0 58 United States Amazon Technology \n", - "2 158.0 73 France LVMH Fashion & Retail \n", - "3 129.0 66 United States Microsoft Technology \n", - "4 118.0 91 United States Berkshire Hathaway Finance & Investments \n" + "Rank 0\n", + "Name 0\n", + "Networth 0\n", + "Age 0\n", + "Country 0\n", + "Source 0\n", + "Industry 0\n", + "dtype: int64\n", + "\n", + "Rank False\n", + "Name False\n", + "Networth False\n", + "Age False\n", + "Country False\n", + "Source False\n", + "Industry False\n", + "dtype: bool\n", + "\n" ] } ], "source": [ - "# Удаление колонок 'rank' и 'name'\n", - "df.drop(columns=['Rank ', 'Name'], inplace=True)\n", + "print(df.isnull().sum())\n", "\n", - "# Проверка, что колонки были удалены\n", - "print(df.head())" + "print()\n", + "\n", + "# Есть ли пустые значения признаков\n", + "print(df.isnull().any())\n", + "\n", + "print()\n", + "\n", + "# Процент пустых значений признаков\n", + "for i in df.columns:\n", + " null_rate = df[i].isnull().sum() / len(df) * 100\n", + " if null_rate > 0:\n", + " print(f\"{i} процент пустых значений: %{null_rate:.2f}\")" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " Networth Country_Argentina Country_Australia Country_Austria \\\n", - "0 219.0 False False False \n", - "1 171.0 False False False \n", - "2 158.0 False False False \n", - "3 129.0 False False False \n", - "4 118.0 False False False \n", + " Rank Name Networth Country \\\n", + "0 1 Elon Musk 219.0 United States \n", + "1 2 Jeff Bezos 171.0 United States \n", + "2 3 Bernard Arnault & family 158.0 France \n", + "3 4 Bill Gates 129.0 United States \n", + "4 5 Warren Buffett 118.0 United States \n", "\n", - " Country_Barbados Country_Belgium Country_Belize Country_Brazil \\\n", - "0 False False False False \n", - "1 False False False False \n", - "2 False False False False \n", - "3 False False False False \n", - "4 False False False False \n", - "\n", - " Country_Bulgaria Country_Canada ... wind wine winter wire wireless \\\n", - "0 False False ... 0.0 0.0 0.0 0.0 0.0 \n", - "1 False False ... 0.0 0.0 0.0 0.0 0.0 \n", - "2 False False ... 0.0 0.0 0.0 0.0 0.0 \n", - "3 False False ... 0.0 0.0 0.0 0.0 0.0 \n", - "4 False False ... 
0.0 0.0 0.0 0.0 0.0 \n", - "\n", - " yahoo yogurt zara zoom Age \n", - "0 0.0 0.0 0.0 0.0 50-60 \n", - "1 0.0 0.0 0.0 0.0 50-60 \n", - "2 0.0 0.0 0.0 0.0 70-80 \n", - "3 0.0 0.0 0.0 0.0 60-70 \n", - "4 0.0 0.0 0.0 0.0 80+ \n", - "\n", - "[5 rows x 828 columns]\n" + " Source Industry Age_category \n", + "0 Tesla, SpaceX Automotive 50-60 \n", + "1 Amazon Technology 50-60 \n", + "2 LVMH Fashion & Retail 70-80 \n", + "3 Microsoft Technology 60-70 \n", + "4 Berkshire Hathaway Finance & Investments 80+ \n" ] } ], "source": [ - "from sklearn.preprocessing import OneHotEncoder\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", - "# Преобразуем 'country' и 'industry' в бинарные матрицы с помощью One-Hot Encoding\n", - "df_country = pd.get_dummies(df[['Country']], drop_first=True)\n", - "df_industry = pd.get_dummies(df[['Industry']], drop_first=True)\n", "\n", - "# Преобразуем колонку 'source' с помощью TF-IDF\n", - "tfidf_vectorizer = TfidfVectorizer(max_features=1000) \n", - "X_tfidf = tfidf_vectorizer.fit_transform(df['Source']).toarray()\n", - "\n", - "# Создаем DataFrame с результатами TF-IDF\n", - "df_source_tfidf = pd.DataFrame(X_tfidf, columns=tfidf_vectorizer.get_feature_names_out())\n", - "\n", - "bins = [0, 30, 40, 50, 60, 70, 80, 100] # границы для возрастных категорий\n", + "bins = [0, 30, 40, 50, 60, 70, 80, 101] # границы для возрастных категорий\n", "labels = ['Under 30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'] # метки для категорий\n", "\n", - "# Создаем новую колонку 'age_group', где будет храниться категория\n", - "df_age_group = pd.cut(df['Age'], bins=bins, labels=labels, right=False)\n", - "\n", - "\n", + "df[\"Age_category\"] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)\n", "# Удаляем оригинальные колонки 'country', 'industry' и 'source' из исходного DataFrame\n", - "df.drop(columns=['Country', 'Industry', 'Source', 'Age'], inplace=True)\n", - "\n", - "# Объединяем все преобразованные данные в один DataFrame\n", - "df_transformed = pd.concat([df, df_country, df_industry, df_source_tfidf, df_age_group], axis=1)\n", + "df.drop(columns=['Age'], inplace=True)\n", "\n", "# Просмотр результата\n", - "print(df_transformed.head())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Разобьём набор данных на обучающую и тестовые выборки (80/20) для задачи классификации. 
Целевой признак- Age" + "print(df.head())" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, "outputs": [ { - "ename": "NameError", - "evalue": "name 'df_transformed' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[18], line 46\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(df_input) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_train) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_val) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_test)\n\u001b[0;32m 43\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df_train, df_val, df_test, y_train, y_val, y_test\n\u001b[0;32m 45\u001b[0m X_train, X_val, X_test, y_train, y_val, y_test \u001b[38;5;241m=\u001b[39m split_stratified_into_train_val_test(\n\u001b[1;32m---> 46\u001b[0m \u001b[43mdf_transformed\u001b[49m, stratify_colname\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAge\u001b[39m\u001b[38;5;124m\"\u001b[39m, frac_train\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.80\u001b[39m, frac_val\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m, frac_test\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.20\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 47\u001b[0m )\n\u001b[0;32m 49\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mX_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, X_train)\n\u001b[0;32m 50\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124my_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, y_train)\n", - "\u001b[1;31mNameError\u001b[0m: name 'df_transformed' is not defined" - ] + "data": { + "text/plain": [ + "'X_train'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RankNameNetworthCountrySourceIndustryAge_category
19091818Tran Ba Duong & family1.6VietnamautomotiveAutomotive60-70
20992076Mark Dixon1.4United Kingdomoffice real estateReal Estate60-70
13921341Yingzhuo Xu2.3ChinaagribusinessFood & Beverage50-60
627622Bruce Flatt4.6Canadamoney managementFinance & Investments50-60
527523Li Liangbin5.2ChinalithiumManufacturing50-60
........................
8485Theo Albrecht, Jr. & family18.7GermanyAldi, Trader Joe'sFashion & Retail70-80
633622Tony Tamer4.6United Statesprivate equityFinance & Investments60-70
922913Bob Gaglardi3.3CanadahotelsReal Estate80+
21782076Eugene Wu1.4TaiwanfinanceFinance & Investments70-80
415411Leonard Stern6.2United Statesreal estateReal Estate80+
\n", + "

2080 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " Rank Name Networth Country \\\n", + "1909 1818 Tran Ba Duong & family 1.6 Vietnam \n", + "2099 2076 Mark Dixon 1.4 United Kingdom \n", + "1392 1341 Yingzhuo Xu 2.3 China \n", + "627 622 Bruce Flatt 4.6 Canada \n", + "527 523 Li Liangbin 5.2 China \n", + "... ... ... ... ... \n", + "84 85 Theo Albrecht, Jr. & family 18.7 Germany \n", + "633 622 Tony Tamer 4.6 United States \n", + "922 913 Bob Gaglardi 3.3 Canada \n", + "2178 2076 Eugene Wu 1.4 Taiwan \n", + "415 411 Leonard Stern 6.2 United States \n", + "\n", + " Source Industry Age_category \n", + "1909 automotive Automotive 60-70 \n", + "2099 office real estate Real Estate 60-70 \n", + "1392 agribusiness Food & Beverage 50-60 \n", + "627 money management Finance & Investments 50-60 \n", + "527 lithium Manufacturing 50-60 \n", + "... ... ... ... \n", + "84 Aldi, Trader Joe's Fashion & Retail 70-80 \n", + "633 private equity Finance & Investments 60-70 \n", + "922 hotels Real Estate 80+ \n", + "2178 finance Finance & Investments 70-80 \n", + "415 real estate Real Estate 80+ \n", + "\n", + "[2080 rows x 7 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'y_train'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Age_category
190960-70
209960-70
139250-60
62750-60
52750-60
......
8470-80
63360-70
92280+
217870-80
41580+
\n", + "

2080 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " Age_category\n", + "1909 60-70\n", + "2099 60-70\n", + "1392 50-60\n", + "627 50-60\n", + "527 50-60\n", + "... ...\n", + "84 70-80\n", + "633 60-70\n", + "922 80+\n", + "2178 70-80\n", + "415 80+\n", + "\n", + "[2080 rows x 1 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'X_test'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RankNameNetworthCountrySourceIndustryAge_category
20752076Radhe Shyam Agarwal1.4Indiaconsumer goodsFashion & Retail70-80
15291513Robert Duggan2.0United StatespharmaceuticalsHealthcare70-80
18031729Yao Kuizhang1.7ChinabeveragesFood & Beverage50-60
425424Alexei Kuzmichev6.0Russiaoil, banking, telecomEnergy50-60
25972578Ramesh Genomal1.0PhilippinesapparelFashion & Retail70-80
........................
935913Alfred Oetker3.3Germanyconsumer goodsFashion & Retail50-60
15411513Thomas Lee2.0United Statesprivate equityFinance & Investments70-80
16461645Roberto Angelini Rossi1.8Chileforestry, miningdiversified70-80
376375Patrick Drahi6.6FrancetelecomTelecom50-60
18941818Gerald Schwartz1.6CanadafinanceFinance & Investments80+
\n", + "

520 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " Rank Name Networth Country \\\n", + "2075 2076 Radhe Shyam Agarwal 1.4 India \n", + "1529 1513 Robert Duggan 2.0 United States \n", + "1803 1729 Yao Kuizhang 1.7 China \n", + "425 424 Alexei Kuzmichev 6.0 Russia \n", + "2597 2578 Ramesh Genomal 1.0 Philippines \n", + "... ... ... ... ... \n", + "935 913 Alfred Oetker 3.3 Germany \n", + "1541 1513 Thomas Lee 2.0 United States \n", + "1646 1645 Roberto Angelini Rossi 1.8 Chile \n", + "376 375 Patrick Drahi 6.6 France \n", + "1894 1818 Gerald Schwartz 1.6 Canada \n", + "\n", + " Source Industry Age_category \n", + "2075 consumer goods Fashion & Retail 70-80 \n", + "1529 pharmaceuticals Healthcare 70-80 \n", + "1803 beverages Food & Beverage 50-60 \n", + "425 oil, banking, telecom Energy 50-60 \n", + "2597 apparel Fashion & Retail 70-80 \n", + "... ... ... ... \n", + "935 consumer goods Fashion & Retail 50-60 \n", + "1541 private equity Finance & Investments 70-80 \n", + "1646 forestry, mining diversified 70-80 \n", + "376 telecom Telecom 50-60 \n", + "1894 finance Finance & Investments 80+ \n", + "\n", + "[520 rows x 7 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'y_test'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Age_category
207570-80
152970-80
180350-60
42550-60
259770-80
......
93550-60
154170-80
164670-80
37650-60
189480+
\n", + "

520 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " Age_category\n", + "2075 70-80\n", + "1529 70-80\n", + "1803 50-60\n", + "425 50-60\n", + "2597 70-80\n", + "... ...\n", + "935 50-60\n", + "1541 70-80\n", + "1646 70-80\n", + "376 50-60\n", + "1894 80+\n", + "\n", + "[520 rows x 1 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "from typing import Tuple\n", - "import pandas as pd\n", - "from pandas import DataFrame\n", - "from sklearn.model_selection import train_test_split\n", - "\n", - "def split_stratified_into_train_val_test(\n", - " df_input,\n", - " stratify_colname=\"y\",\n", - " frac_train=0.6,\n", - " frac_val=0.15,\n", - " frac_test=0.25,\n", - " random_state=None,\n", - ") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n", - " \n", - " if frac_train + frac_val + frac_test != 1.0:\n", - " raise ValueError(\n", - " \"fractions %f, %f, %f do not add up to 1.0\"\n", - " % (frac_train, frac_val, frac_test)\n", - " )\n", - " if stratify_colname not in df_input.columns:\n", - " raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n", - " X = df_input # Contains all columns.\n", - " y = df_input[\n", - " [stratify_colname]\n", - " ] # Dataframe of just the column on which to stratify.\n", - " # Split original dataframe into train and temp dataframes.\n", - " df_train, df_temp, y_train, y_temp = train_test_split(\n", - " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n", - " )\n", - " if frac_val <= 0:\n", - " assert len(df_input) == len(df_train) + len(df_temp)\n", - " return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n", - " # Split the temp dataframe into val and test dataframes.\n", - " relative_frac_test = frac_test / (frac_val + frac_test)\n", - " df_val, df_test, y_val, y_test = train_test_split(\n", - " df_temp,\n", - " y_temp,\n", - " stratify=y_temp,\n", - " test_size=relative_frac_test,\n", - " random_state=random_state,\n", - " )\n", - " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n", - " return df_train, df_val, df_test, y_train, y_val, y_test\n", + "from utils import split_stratified_into_train_val_test\n", "\n", "X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n", - " df, stratify_colname=\"Age\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=None\n", + " df, stratify_colname=\"Age_category\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=9\n", ")\n", "\n", "display(\"X_train\", X_train)\n", @@ -214,6 +723,135 @@ "display(\"X_test\", X_test)\n", "display(\"y_test\", y_test)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Формирование конвейера для классификации данных\n", + "## preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n", + "## preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование\n", + "## features_preprocessing -- трансформер для предобработки признаков\n", + "## features_engineering -- трансформер для конструирования признаков\n", + "## drop_columns -- трансформер для удаления колонок\n", + "## pipeline_end -- основной конвейер предобработки данных и конструирования признаков" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.discriminant_analysis import 
StandardScaler\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "\n", + "\n", + "columns_to_drop = [\"Rank \", \"Name\"]\n", + "num_columns = [\n", + " column\n", + " for column in df.columns\n", + " if column not in columns_to_drop and df[column].dtype != \"object\"\n", + "]\n", + "cat_columns = [\n", + " column\n", + " for column in df.columns\n", + " if column not in columns_to_drop and df[column].dtype == \"object\"\n", + "]\n", + "\n", + "num_imputer = SimpleImputer(strategy=\"median\")\n", + "num_scaler = StandardScaler()\n", + "preprocessing_num = Pipeline(\n", + " [\n", + " (\"imputer\", num_imputer),\n", + " (\"scaler\", num_scaler),\n", + " ]\n", + ")\n", + "\n", + "cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n", + "cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n", + "preprocessing_cat = Pipeline(\n", + " [\n", + " (\"imputer\", cat_imputer),\n", + " (\"encoder\", cat_encoder),\n", + " ]\n", + ")\n", + "\n", + "features_preprocessing = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"prepocessing_num\", preprocessing_num, num_columns),\n", + " (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n", + " (\"prepocessing_features\", cat_imputer),\n", + " ],\n", + " remainder=\"passthrough\"\n", + ")\n", + "\n", + "\n", + "drop_columns = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"drop_columns\", \"drop\", columns_to_drop),\n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", + "\n", + "features_postprocessing = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"prepocessing_cat\", preprocessing_cat, [\"Cabin_type\"]),\n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", + "\n", + "pipeline_end = Pipeline(\n", + " [\n", + " (\"features_preprocessing\", features_preprocessing),\n", + " (\"drop_columns\", drop_columns),\n", + " (\"features_postprocessing\", features_postprocessing),\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "not enough values to unpack (expected 3, got 2)", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[31], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m preprocessing_result \u001b[38;5;241m=\u001b[39m \u001b[43mpipeline_end\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m preprocessed_df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(\n\u001b[0;32m 3\u001b[0m preprocessing_result,\n\u001b[0;32m 4\u001b[0m columns\u001b[38;5;241m=\u001b[39mpipeline_end\u001b[38;5;241m.\u001b[39mget_feature_names_out(),\n\u001b[0;32m 5\u001b[0m )\n\u001b[0;32m 7\u001b[0m preprocessed_df\n", + "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context..decorator..wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m 
\u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py:533\u001b[0m, in \u001b[0;36mPipeline.fit_transform\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 490\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Fit the model and transform with the final estimator.\u001b[39;00m\n\u001b[0;32m 491\u001b[0m \n\u001b[0;32m 492\u001b[0m \u001b[38;5;124;03mFit all the transformers one after the other and sequentially transform\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 530\u001b[0m \u001b[38;5;124;03m Transformed samples.\u001b[39;00m\n\u001b[0;32m 531\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 532\u001b[0m routed_params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_method_params(method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m, props\u001b[38;5;241m=\u001b[39mparams)\n\u001b[1;32m--> 533\u001b[0m Xt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrouted_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 535\u001b[0m last_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_final_estimator\n\u001b[0;32m 536\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_log_message(\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m)):\n", + "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py:406\u001b[0m, in \u001b[0;36mPipeline._fit\u001b[1;34m(self, X, y, routed_params)\u001b[0m\n\u001b[0;32m 404\u001b[0m cloned_transformer \u001b[38;5;241m=\u001b[39m clone(transformer)\n\u001b[0;32m 405\u001b[0m \u001b[38;5;66;03m# Fit or load from cache the current transformer\u001b[39;00m\n\u001b[1;32m--> 406\u001b[0m X, fitted_transformer \u001b[38;5;241m=\u001b[39m \u001b[43mfit_transform_one_cached\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 407\u001b[0m \u001b[43m \u001b[49m\u001b[43mcloned_transformer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 408\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 409\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 410\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 411\u001b[0m 
\u001b[43m \u001b[49m\u001b[43mmessage_clsname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPipeline\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 412\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_message\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstep_idx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 413\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrouted_params\u001b[49m\u001b[43m[\u001b[49m\u001b[43mname\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 414\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 415\u001b[0m \u001b[38;5;66;03m# Replace the transformer of the step with the fitted\u001b[39;00m\n\u001b[0;32m 416\u001b[0m \u001b[38;5;66;03m# transformer. This is necessary when loading the transformer\u001b[39;00m\n\u001b[0;32m 417\u001b[0m \u001b[38;5;66;03m# from the cache.\u001b[39;00m\n\u001b[0;32m 418\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps[step_idx] \u001b[38;5;241m=\u001b[39m (name, fitted_transformer)\n", + "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\joblib\\memory.py:312\u001b[0m, in \u001b[0;36mNotMemorizedFunc.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 311\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 312\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py:1310\u001b[0m, in \u001b[0;36m_fit_transform_one\u001b[1;34m(transformer, X, y, weight, message_clsname, message, params)\u001b[0m\n\u001b[0;32m 1308\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(message_clsname, message):\n\u001b[0;32m 1309\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(transformer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m-> 1310\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mtransformer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfit_transform\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1311\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1312\u001b[0m res \u001b[38;5;241m=\u001b[39m transformer\u001b[38;5;241m.\u001b[39mfit(X, y, 
\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit\u001b[39m\u001b[38;5;124m\"\u001b[39m, {}))\u001b[38;5;241m.\u001b[39mtransform(\n\u001b[0;32m 1313\u001b[0m X, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtransform\u001b[39m\u001b[38;5;124m\"\u001b[39m, {})\n\u001b[0;32m 1314\u001b[0m )\n", + "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\utils\\_set_output.py:316\u001b[0m, in \u001b[0;36m_wrap_method_output..wrapped\u001b[1;34m(self, X, *args, **kwargs)\u001b[0m\n\u001b[0;32m 314\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(f)\n\u001b[0;32m 315\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapped\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 316\u001b[0m data_to_wrap \u001b[38;5;241m=\u001b[39m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_to_wrap, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[0;32m 318\u001b[0m \u001b[38;5;66;03m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[0;32m 319\u001b[0m return_tuple \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 320\u001b[0m _wrap_data_with_container(method, data_to_wrap[\u001b[38;5;241m0\u001b[39m], X, \u001b[38;5;28mself\u001b[39m),\n\u001b[0;32m 321\u001b[0m \u001b[38;5;241m*\u001b[39mdata_to_wrap[\u001b[38;5;241m1\u001b[39m:],\n\u001b[0;32m 322\u001b[0m )\n", + "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context..decorator..wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:965\u001b[0m, in \u001b[0;36mColumnTransformer.fit_transform\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 963\u001b[0m \u001b[38;5;66;03m# set n_features_in_ attribute\u001b[39;00m\n\u001b[0;32m 964\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_n_features(X, reset\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m--> 
965\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_transformers\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 966\u001b[0m n_samples \u001b[38;5;241m=\u001b[39m _num_samples(X)\n\u001b[0;32m 968\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_column_callables(X)\n", + "File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:501\u001b[0m, in \u001b[0;36mColumnTransformer._validate_transformers\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransformers:\n\u001b[0;32m 499\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m--> 501\u001b[0m names, transformers, _ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mzip\u001b[39m(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransformers)\n\u001b[0;32m 503\u001b[0m \u001b[38;5;66;03m# validate names\u001b[39;00m\n\u001b[0;32m 504\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_names(names)\n", + "\u001b[1;31mValueError\u001b[0m: not enough values to unpack (expected 3, got 2)" + ] + } + ], + "source": [ + "preprocessing_result = pipeline_end.fit_transform(X_train)\n", + "preprocessed_df = pd.DataFrame(\n", + " preprocessing_result,\n", + " columns=pipeline_end.get_feature_names_out(),\n", + ")\n", + "\n", + "preprocessed_df" + ] } ], "metadata": {
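With frac_val=0, the helper imported from utils reduces to a single stratified train_test_split; this is a minimal sketch of that frac_val <= 0 branch of the implementation removed above, assuming df and its Age_category column from the earlier cells, not the utils module itself.

from sklearn.model_selection import train_test_split

# Sketch of the frac_val = 0 case: one stratified 80/20 split on the
# Age_category target (hypothetical equivalent, not the utils helper itself).
X_train, X_test, y_train, y_test = train_test_split(
    df,                       # all columns, as in the removed implementation
    df[["Age_category"]],     # target as a one-column DataFrame
    test_size=0.20,
    stratify=df["Age_category"],
    random_state=9,
)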
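The ValueError raised by pipeline_end.fit_transform comes from the features_preprocessing ColumnTransformer: the entry ("prepocessing_features", cat_imputer) has only two elements, while ColumnTransformer expects (name, transformer, columns) triples, hence "not enough values to unpack (expected 3, got 2)". The features_postprocessing step also references a "Cabin_type" column that does not exist in this dataset. Below is a minimal sketch of a preprocessing pipeline that avoids both problems; it is an illustration under assumptions, not the notebook's final code: it reuses X_train from the split above, keeps the notebook's imputer/scaler/encoder settings, excludes the Age_category target from the feature lists, and leaves "Rank " and "Name" unselected instead of dropping them in a second ColumnTransformer.

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

columns_to_drop = ["Rank ", "Name"]   # as in the cell above
target_column = "Age_category"

# Split features by dtype, leaving out the dropped columns and the target.
num_columns = [
    c for c in X_train.columns
    if c not in columns_to_drop + [target_column]
    and pd.api.types.is_numeric_dtype(X_train[c])
]
cat_columns = [
    c for c in X_train.columns
    if c not in columns_to_drop + [target_column]
    and not pd.api.types.is_numeric_dtype(X_train[c])
]

preprocessing_num = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

preprocessing_cat = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")),
])

# Every transformer entry must be a (name, transformer, columns) triple.
features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("preprocessing_num", preprocessing_num, num_columns),
        ("preprocessing_cat", preprocessing_cat, cat_columns),
    ],
    remainder="drop",  # "Rank ", "Name" and the target are simply not selected
)

pipeline_end = Pipeline([
    ("features_preprocessing", features_preprocessing),
])

preprocessing_result = pipeline_end.fit_transform(X_train)
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
)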