commit2

This commit is contained in:
annalyovushkina@yandex.ru 2024-11-15 22:37:33 +04:00
parent 41c4ab91ed
commit 8fc280c7e7


@@ -33,7 +33,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
-"# Prepare the data: drop the rank and name columns (they hold unique values that play no part in prediction), convert the nominal columns (country, source, industry) to numeric, and categorize the age column"
"# Prepare the data: categorize the age column"
]
},
{
@@ -45,167 +45,676 @@
"name": "stdout",
"output_type": "stream",
"text": [
-" Networth Age Country Source Industry\n",
-"0 219.0 50 United States Tesla, SpaceX Automotive \n",
-"1 171.0 58 United States Amazon Technology \n",
-"2 158.0 73 France LVMH Fashion & Retail \n",
-"3 129.0 66 United States Microsoft Technology \n",
-"4 118.0 91 United States Berkshire Hathaway Finance & Investments \n"
"Rank 0\n",
"Name 0\n",
"Networth 0\n",
"Age 0\n",
"Country 0\n",
"Source 0\n",
"Industry 0\n",
"dtype: int64\n",
"\n",
"Rank False\n",
"Name False\n",
"Networth False\n",
"Age False\n",
"Country False\n",
"Source False\n",
"Industry False\n",
"dtype: bool\n",
"\n"
]
}
],
"source": [
-"# Drop the 'rank' and 'name' columns\n",
-"df.drop(columns=['Rank ', 'Name'], inplace=True)\n",
-"\n",
-"# Check that the columns were dropped\n",
-"print(df.head())"
"print(df.isnull().sum())\n",
"\n",
"print()\n",
"\n",
"# Are there any missing feature values?\n",
"print(df.isnull().any())\n",
"\n",
"print()\n",
"\n",
"# Percentage of missing values per feature\n",
"for i in df.columns:\n",
"    null_rate = df[i].isnull().sum() / len(df) * 100\n",
"    if null_rate > 0:\n",
"        print(f\"{i} percent of missing values: %{null_rate:.2f}\")"
]
},
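
The new cell replaces the column-dropping step with a missing-value audit: absolute counts, a per-column boolean flag, and a percentage loop. A minimal standalone sketch of the same audit, using a toy frame in place of the notebook's billionaires dataset:

    import pandas as pd

    # Toy stand-in for the dataset; None marks a missing value
    df = pd.DataFrame({"Networth": [219.0, None, 158.0], "Age": [50, 58, None]})

    print(df.isnull().sum())                     # absolute count of missing values per column
    print(df.isnull().any())                     # True where a column has any missing value
    print(df.isnull().mean().mul(100).round(2))  # percentage: one-line version of the loop above
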
{
"cell_type": "code",
-"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
-" Networth Country_Argentina Country_Australia Country_Austria \\\n",
-"0 219.0 False False False \n",
-"1 171.0 False False False \n",
-"2 158.0 False False False \n",
-"3 129.0 False False False \n",
-"4 118.0 False False False \n",
-"\n",
-" Country_Barbados Country_Belgium Country_Belize Country_Brazil \\\n",
-"0 False False False False \n",
-"1 False False False False \n",
-"2 False False False False \n",
-"3 False False False False \n",
-"4 False False False False \n",
-"\n",
-" Country_Bulgaria Country_Canada ... wind wine winter wire wireless \\\n",
-"0 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
-"1 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
-"2 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
-"3 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
-"4 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
-"\n",
-" yahoo yogurt zara zoom Age \n",
-"0 0.0 0.0 0.0 0.0 50-60 \n",
-"1 0.0 0.0 0.0 0.0 50-60 \n",
-"2 0.0 0.0 0.0 0.0 70-80 \n",
-"3 0.0 0.0 0.0 0.0 60-70 \n",
-"4 0.0 0.0 0.0 0.0 80+ \n",
-"\n",
-"[5 rows x 828 columns]\n"
" Rank Name Networth Country \\\n",
"0 1 Elon Musk 219.0 United States \n",
"1 2 Jeff Bezos 171.0 United States \n",
"2 3 Bernard Arnault & family 158.0 France \n",
"3 4 Bill Gates 129.0 United States \n",
"4 5 Warren Buffett 118.0 United States \n",
"\n",
" Source Industry Age_category \n",
"0 Tesla, SpaceX Automotive 50-60 \n",
"1 Amazon Technology 50-60 \n",
"2 LVMH Fashion & Retail 70-80 \n",
"3 Microsoft Technology 60-70 \n",
"4 Berkshire Hathaway Finance & Investments 80+ \n"
]
}
],
"source": [
-"from sklearn.preprocessing import OneHotEncoder\n",
-"from sklearn.feature_extraction.text import TfidfVectorizer\n",
-"\n",
-"# One-hot encode 'country' and 'industry' into binary matrices\n",
-"df_country = pd.get_dummies(df[['Country']], drop_first=True)\n",
-"df_industry = pd.get_dummies(df[['Industry']], drop_first=True)\n",
-"\n",
-"# Encode the 'source' column with TF-IDF\n",
-"tfidf_vectorizer = TfidfVectorizer(max_features=1000)\n",
-"X_tfidf = tfidf_vectorizer.fit_transform(df['Source']).toarray()\n",
-"\n",
-"# Build a DataFrame from the TF-IDF results\n",
-"df_source_tfidf = pd.DataFrame(X_tfidf, columns=tfidf_vectorizer.get_feature_names_out())\n",
-"\n",
-"bins = [0, 30, 40, 50, 60, 70, 80, 100] # age-category bin edges\n",
-"labels = ['Under 30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'] # category labels\n",
-"\n",
-"# Create a new 'age_group' column to hold the category\n",
-"df_age_group = pd.cut(df['Age'], bins=bins, labels=labels, right=False)\n",
-"\n",
-"\n",
-"# Drop the original 'country', 'industry' and 'source' columns from the source DataFrame\n",
-"df.drop(columns=['Country', 'Industry', 'Source', 'Age'], inplace=True)\n",
-"\n",
-"# Concatenate all transformed data into a single DataFrame\n",
-"df_transformed = pd.concat([df, df_country, df_industry, df_source_tfidf, df_age_group], axis=1)\n",
-"\n",
-"# Preview the result\n",
-"print(df_transformed.head())"
"\n",
"\n",
"bins = [0, 30, 40, 50, 60, 70, 80, 101] # age-category bin edges\n",
"labels = ['Under 30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'] # category labels\n",
"\n",
"df[\"Age_category\"] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)\n",
"# Drop the original 'age' column from the source DataFrame\n",
"df.drop(columns=['Age'], inplace=True)\n",
"\n",
"# Preview the result\n",
"print(df.head())"
]
},
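
Note the changed upper bin edge (100 to 101): with right=False, pd.cut builds left-closed, right-open intervals, so [80, 100) would silently turn a 100-year-old into NaN, while [80, 101) keeps them in the '80+' band. A small sketch of that behavior on toy ages:

    import pandas as pd

    bins = [0, 30, 40, 50, 60, 70, 80, 101]
    labels = ['Under 30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+']

    ages = pd.Series([29, 30, 80, 100])  # toy values, not the dataset
    print(pd.cut(ages, bins=bins, labels=labels, right=False))
    # 29 -> Under 30, 30 -> 30-40, 80 -> 80+, 100 -> 80+ (would be NaN with a last edge of 100)
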
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Split the dataset into training and test samples (80/20) for the classification task. Target feature: Age"
]
},
{
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 27,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"ename": "NameError", "data": {
"evalue": "name 'df_transformed' is not defined", "text/plain": [
"output_type": "error", "'X_train'"
"traceback": [ ]
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", },
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", "metadata": {},
"Cell \u001b[1;32mIn[18], line 46\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(df_input) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_train) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_val) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_test)\n\u001b[0;32m 43\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df_train, df_val, df_test, y_train, y_val, y_test\n\u001b[0;32m 45\u001b[0m X_train, X_val, X_test, y_train, y_val, y_test \u001b[38;5;241m=\u001b[39m split_stratified_into_train_val_test(\n\u001b[1;32m---> 46\u001b[0m \u001b[43mdf_transformed\u001b[49m, stratify_colname\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAge\u001b[39m\u001b[38;5;124m\"\u001b[39m, frac_train\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.80\u001b[39m, frac_val\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m, frac_test\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.20\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 47\u001b[0m )\n\u001b[0;32m 49\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mX_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, X_train)\n\u001b[0;32m 50\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124my_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, y_train)\n", "output_type": "display_data"
"\u001b[1;31mNameError\u001b[0m: name 'df_transformed' is not defined" },
] {
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Rank</th>\n",
" <th>Name</th>\n",
" <th>Networth</th>\n",
" <th>Country</th>\n",
" <th>Source</th>\n",
" <th>Industry</th>\n",
" <th>Age_category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1909</th>\n",
" <td>1818</td>\n",
" <td>Tran Ba Duong &amp; family</td>\n",
" <td>1.6</td>\n",
" <td>Vietnam</td>\n",
" <td>automotive</td>\n",
" <td>Automotive</td>\n",
" <td>60-70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2099</th>\n",
" <td>2076</td>\n",
" <td>Mark Dixon</td>\n",
" <td>1.4</td>\n",
" <td>United Kingdom</td>\n",
" <td>office real estate</td>\n",
" <td>Real Estate</td>\n",
" <td>60-70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1392</th>\n",
" <td>1341</td>\n",
" <td>Yingzhuo Xu</td>\n",
" <td>2.3</td>\n",
" <td>China</td>\n",
" <td>agribusiness</td>\n",
" <td>Food &amp; Beverage</td>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>627</th>\n",
" <td>622</td>\n",
" <td>Bruce Flatt</td>\n",
" <td>4.6</td>\n",
" <td>Canada</td>\n",
" <td>money management</td>\n",
" <td>Finance &amp; Investments</td>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>527</th>\n",
" <td>523</td>\n",
" <td>Li Liangbin</td>\n",
" <td>5.2</td>\n",
" <td>China</td>\n",
" <td>lithium</td>\n",
" <td>Manufacturing</td>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>84</th>\n",
" <td>85</td>\n",
" <td>Theo Albrecht, Jr. &amp; family</td>\n",
" <td>18.7</td>\n",
" <td>Germany</td>\n",
" <td>Aldi, Trader Joe's</td>\n",
" <td>Fashion &amp; Retail</td>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>633</th>\n",
" <td>622</td>\n",
" <td>Tony Tamer</td>\n",
" <td>4.6</td>\n",
" <td>United States</td>\n",
" <td>private equity</td>\n",
" <td>Finance &amp; Investments</td>\n",
" <td>60-70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>922</th>\n",
" <td>913</td>\n",
" <td>Bob Gaglardi</td>\n",
" <td>3.3</td>\n",
" <td>Canada</td>\n",
" <td>hotels</td>\n",
" <td>Real Estate</td>\n",
" <td>80+</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2178</th>\n",
" <td>2076</td>\n",
" <td>Eugene Wu</td>\n",
" <td>1.4</td>\n",
" <td>Taiwan</td>\n",
" <td>finance</td>\n",
" <td>Finance &amp; Investments</td>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>415</th>\n",
" <td>411</td>\n",
" <td>Leonard Stern</td>\n",
" <td>6.2</td>\n",
" <td>United States</td>\n",
" <td>real estate</td>\n",
" <td>Real Estate</td>\n",
" <td>80+</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2080 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" Rank Name Networth Country \\\n",
"1909 1818 Tran Ba Duong & family 1.6 Vietnam \n",
"2099 2076 Mark Dixon 1.4 United Kingdom \n",
"1392 1341 Yingzhuo Xu 2.3 China \n",
"627 622 Bruce Flatt 4.6 Canada \n",
"527 523 Li Liangbin 5.2 China \n",
"... ... ... ... ... \n",
"84 85 Theo Albrecht, Jr. & family 18.7 Germany \n",
"633 622 Tony Tamer 4.6 United States \n",
"922 913 Bob Gaglardi 3.3 Canada \n",
"2178 2076 Eugene Wu 1.4 Taiwan \n",
"415 411 Leonard Stern 6.2 United States \n",
"\n",
" Source Industry Age_category \n",
"1909 automotive Automotive 60-70 \n",
"2099 office real estate Real Estate 60-70 \n",
"1392 agribusiness Food & Beverage 50-60 \n",
"627 money management Finance & Investments 50-60 \n",
"527 lithium Manufacturing 50-60 \n",
"... ... ... ... \n",
"84 Aldi, Trader Joe's Fashion & Retail 70-80 \n",
"633 private equity Finance & Investments 60-70 \n",
"922 hotels Real Estate 80+ \n",
"2178 finance Finance & Investments 70-80 \n",
"415 real estate Real Estate 80+ \n",
"\n",
"[2080 rows x 7 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'y_train'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age_category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1909</th>\n",
" <td>60-70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2099</th>\n",
" <td>60-70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1392</th>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>627</th>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>527</th>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>84</th>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>633</th>\n",
" <td>60-70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>922</th>\n",
" <td>80+</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2178</th>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>415</th>\n",
" <td>80+</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2080 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" Age_category\n",
"1909 60-70\n",
"2099 60-70\n",
"1392 50-60\n",
"627 50-60\n",
"527 50-60\n",
"... ...\n",
"84 70-80\n",
"633 60-70\n",
"922 80+\n",
"2178 70-80\n",
"415 80+\n",
"\n",
"[2080 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'X_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Rank</th>\n",
" <th>Name</th>\n",
" <th>Networth</th>\n",
" <th>Country</th>\n",
" <th>Source</th>\n",
" <th>Industry</th>\n",
" <th>Age_category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2075</th>\n",
" <td>2076</td>\n",
" <td>Radhe Shyam Agarwal</td>\n",
" <td>1.4</td>\n",
" <td>India</td>\n",
" <td>consumer goods</td>\n",
" <td>Fashion &amp; Retail</td>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1529</th>\n",
" <td>1513</td>\n",
" <td>Robert Duggan</td>\n",
" <td>2.0</td>\n",
" <td>United States</td>\n",
" <td>pharmaceuticals</td>\n",
" <td>Healthcare</td>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1803</th>\n",
" <td>1729</td>\n",
" <td>Yao Kuizhang</td>\n",
" <td>1.7</td>\n",
" <td>China</td>\n",
" <td>beverages</td>\n",
" <td>Food &amp; Beverage</td>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>425</th>\n",
" <td>424</td>\n",
" <td>Alexei Kuzmichev</td>\n",
" <td>6.0</td>\n",
" <td>Russia</td>\n",
" <td>oil, banking, telecom</td>\n",
" <td>Energy</td>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2597</th>\n",
" <td>2578</td>\n",
" <td>Ramesh Genomal</td>\n",
" <td>1.0</td>\n",
" <td>Philippines</td>\n",
" <td>apparel</td>\n",
" <td>Fashion &amp; Retail</td>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>935</th>\n",
" <td>913</td>\n",
" <td>Alfred Oetker</td>\n",
" <td>3.3</td>\n",
" <td>Germany</td>\n",
" <td>consumer goods</td>\n",
" <td>Fashion &amp; Retail</td>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1541</th>\n",
" <td>1513</td>\n",
" <td>Thomas Lee</td>\n",
" <td>2.0</td>\n",
" <td>United States</td>\n",
" <td>private equity</td>\n",
" <td>Finance &amp; Investments</td>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1646</th>\n",
" <td>1645</td>\n",
" <td>Roberto Angelini Rossi</td>\n",
" <td>1.8</td>\n",
" <td>Chile</td>\n",
" <td>forestry, mining</td>\n",
" <td>diversified</td>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>376</th>\n",
" <td>375</td>\n",
" <td>Patrick Drahi</td>\n",
" <td>6.6</td>\n",
" <td>France</td>\n",
" <td>telecom</td>\n",
" <td>Telecom</td>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1894</th>\n",
" <td>1818</td>\n",
" <td>Gerald Schwartz</td>\n",
" <td>1.6</td>\n",
" <td>Canada</td>\n",
" <td>finance</td>\n",
" <td>Finance &amp; Investments</td>\n",
" <td>80+</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>520 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" Rank Name Networth Country \\\n",
"2075 2076 Radhe Shyam Agarwal 1.4 India \n",
"1529 1513 Robert Duggan 2.0 United States \n",
"1803 1729 Yao Kuizhang 1.7 China \n",
"425 424 Alexei Kuzmichev 6.0 Russia \n",
"2597 2578 Ramesh Genomal 1.0 Philippines \n",
"... ... ... ... ... \n",
"935 913 Alfred Oetker 3.3 Germany \n",
"1541 1513 Thomas Lee 2.0 United States \n",
"1646 1645 Roberto Angelini Rossi 1.8 Chile \n",
"376 375 Patrick Drahi 6.6 France \n",
"1894 1818 Gerald Schwartz 1.6 Canada \n",
"\n",
" Source Industry Age_category \n",
"2075 consumer goods Fashion & Retail 70-80 \n",
"1529 pharmaceuticals Healthcare 70-80 \n",
"1803 beverages Food & Beverage 50-60 \n",
"425 oil, banking, telecom Energy 50-60 \n",
"2597 apparel Fashion & Retail 70-80 \n",
"... ... ... ... \n",
"935 consumer goods Fashion & Retail 50-60 \n",
"1541 private equity Finance & Investments 70-80 \n",
"1646 forestry, mining diversified 70-80 \n",
"376 telecom Telecom 50-60 \n",
"1894 finance Finance & Investments 80+ \n",
"\n",
"[520 rows x 7 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'y_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age_category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2075</th>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1529</th>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1803</th>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>425</th>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2597</th>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>935</th>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1541</th>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1646</th>\n",
" <td>70-80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>376</th>\n",
" <td>50-60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1894</th>\n",
" <td>80+</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>520 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" Age_category\n",
"2075 70-80\n",
"1529 70-80\n",
"1803 50-60\n",
"425 50-60\n",
"2597 70-80\n",
"... ...\n",
"935 50-60\n",
"1541 70-80\n",
"1646 70-80\n",
"376 50-60\n",
"1894 80+\n",
"\n",
"[520 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
-"from typing import Tuple\n",
-"import pandas as pd\n",
-"from pandas import DataFrame\n",
-"from sklearn.model_selection import train_test_split\n",
-"\n",
-"def split_stratified_into_train_val_test(\n",
-"    df_input,\n",
-"    stratify_colname=\"y\",\n",
-"    frac_train=0.6,\n",
-"    frac_val=0.15,\n",
-"    frac_test=0.25,\n",
-"    random_state=None,\n",
-") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
-"\n",
-"    if frac_train + frac_val + frac_test != 1.0:\n",
-"        raise ValueError(\n",
-"            \"fractions %f, %f, %f do not add up to 1.0\"\n",
-"            % (frac_train, frac_val, frac_test)\n",
-"        )\n",
-"    if stratify_colname not in df_input.columns:\n",
-"        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
-"    X = df_input  # Contains all columns.\n",
-"    y = df_input[\n",
-"        [stratify_colname]\n",
-"    ]  # Dataframe of just the column on which to stratify.\n",
-"    # Split original dataframe into train and temp dataframes.\n",
-"    df_train, df_temp, y_train, y_temp = train_test_split(\n",
-"        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
-"    )\n",
-"    if frac_val <= 0:\n",
-"        assert len(df_input) == len(df_train) + len(df_temp)\n",
-"        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
-"    # Split the temp dataframe into val and test dataframes.\n",
-"    relative_frac_test = frac_test / (frac_val + frac_test)\n",
-"    df_val, df_test, y_val, y_test = train_test_split(\n",
-"        df_temp,\n",
-"        y_temp,\n",
-"        stratify=y_temp,\n",
-"        test_size=relative_frac_test,\n",
-"        random_state=random_state,\n",
-"    )\n",
-"    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
-"    return df_train, df_val, df_test, y_train, y_val, y_test\n",
"from utils import split_stratified_into_train_val_test\n",
"\n",
"X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
-"    df, stratify_colname=\"Age\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=None\n",
"    df, stratify_colname=\"Age_category\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=9\n",
")\n",
"\n",
"display(\"X_train\", X_train)\n",
@@ -214,6 +723,135 @@
"display(\"X_test\", X_test)\n",
"display(\"y_test\", y_test)"
]
},
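
The helper now lives in a local utils module; judging by the deleted lines above, utils.split_stratified_into_train_val_test presumably wraps sklearn's train_test_split with stratification on the chosen column. A self-contained sketch of what stratification buys here, on toy data with made-up proportions:

    import pandas as pd
    from sklearn.model_selection import train_test_split

    # Toy frame: 'Age_category' plays the role of the stratification target
    df = pd.DataFrame({
        "Networth": range(20),
        "Age_category": ["50-60"] * 10 + ["70-80"] * 10,
    })

    train, test = train_test_split(
        df, test_size=0.20, stratify=df["Age_category"], random_state=9
    )

    # Class shares survive the 80/20 split in both parts
    print(train["Age_category"].value_counts(normalize=True))
    print(test["Age_category"].value_counts(normalize=True))
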
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Формирование конвейера для классификации данных\n",
"## preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n",
"## preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование\n",
"## features_preprocessing -- трансформер для предобработки признаков\n",
"## features_engineering -- трансформер для конструирования признаков\n",
"## drop_columns -- трансформер для удаления колонок\n",
"## pipeline_end -- основной конвейер предобработки данных и конструирования признаков"
]
},
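
Two parameters recur in the cell below: remainder="passthrough" forwards columns that no transformer claims, and verbose_feature_names_out=False keeps output column names unprefixed. A tiny sketch of both, on toy columns rather than the notebook's data:

    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler

    df = pd.DataFrame({"Networth": [1.0, 2.0], "Country": ["US", "FR"]})

    ct = ColumnTransformer(
        transformers=[("num", StandardScaler(), ["Networth"])],
        remainder="passthrough",          # 'Country' passes through untouched
        verbose_feature_names_out=False,  # 'Networth' instead of 'num__Networth'
    )
    ct.fit(df)
    print(ct.get_feature_names_out())     # ['Networth' 'Country']
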
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.discriminant_analysis import StandardScaler\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"\n",
"columns_to_drop = [\"Rank \", \"Name\"]\n",
"num_columns = [\n",
" column\n",
" for column in df.columns\n",
" if column not in columns_to_drop and df[column].dtype != \"object\"\n",
"]\n",
"cat_columns = [\n",
" column\n",
" for column in df.columns\n",
" if column not in columns_to_drop and df[column].dtype == \"object\"\n",
"]\n",
"\n",
"num_imputer = SimpleImputer(strategy=\"median\")\n",
"num_scaler = StandardScaler()\n",
"preprocessing_num = Pipeline(\n",
" [\n",
" (\"imputer\", num_imputer),\n",
" (\"scaler\", num_scaler),\n",
" ]\n",
")\n",
"\n",
"cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n",
"cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n",
"preprocessing_cat = Pipeline(\n",
" [\n",
" (\"imputer\", cat_imputer),\n",
" (\"encoder\", cat_encoder),\n",
" ]\n",
")\n",
"\n",
"features_preprocessing = ColumnTransformer(\n",
" verbose_feature_names_out=False,\n",
" transformers=[\n",
" (\"prepocessing_num\", preprocessing_num, num_columns),\n",
" (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n",
" (\"prepocessing_features\", cat_imputer),\n",
" ],\n",
" remainder=\"passthrough\"\n",
")\n",
"\n",
"\n",
"drop_columns = ColumnTransformer(\n",
" verbose_feature_names_out=False,\n",
" transformers=[\n",
" (\"drop_columns\", \"drop\", columns_to_drop),\n",
" ],\n",
" remainder=\"passthrough\",\n",
")\n",
"\n",
"features_postprocessing = ColumnTransformer(\n",
" verbose_feature_names_out=False,\n",
" transformers=[\n",
" (\"prepocessing_cat\", preprocessing_cat, [\"Cabin_type\"]),\n",
" ],\n",
" remainder=\"passthrough\",\n",
")\n",
"\n",
"pipeline_end = Pipeline(\n",
" [\n",
" (\"features_preprocessing\", features_preprocessing),\n",
" (\"drop_columns\", drop_columns),\n",
" (\"features_postprocessing\", features_postprocessing),\n",
" ]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "not enough values to unpack (expected 3, got 2)",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[31], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m preprocessing_result \u001b[38;5;241m=\u001b[39m \u001b[43mpipeline_end\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m preprocessed_df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(\n\u001b[0;32m 3\u001b[0m preprocessing_result,\n\u001b[0;32m 4\u001b[0m columns\u001b[38;5;241m=\u001b[39mpipeline_end\u001b[38;5;241m.\u001b[39mget_feature_names_out(),\n\u001b[0;32m 5\u001b[0m )\n\u001b[0;32m 7\u001b[0m preprocessed_df\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py:533\u001b[0m, in \u001b[0;36mPipeline.fit_transform\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 490\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Fit the model and transform with the final estimator.\u001b[39;00m\n\u001b[0;32m 491\u001b[0m \n\u001b[0;32m 492\u001b[0m \u001b[38;5;124;03mFit all the transformers one after the other and sequentially transform\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 530\u001b[0m \u001b[38;5;124;03m Transformed samples.\u001b[39;00m\n\u001b[0;32m 531\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 532\u001b[0m routed_params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_method_params(method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m, props\u001b[38;5;241m=\u001b[39mparams)\n\u001b[1;32m--> 533\u001b[0m Xt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrouted_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 535\u001b[0m last_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_final_estimator\n\u001b[0;32m 536\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_log_message(\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m)):\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py:406\u001b[0m, in \u001b[0;36mPipeline._fit\u001b[1;34m(self, X, y, routed_params)\u001b[0m\n\u001b[0;32m 404\u001b[0m cloned_transformer \u001b[38;5;241m=\u001b[39m clone(transformer)\n\u001b[0;32m 405\u001b[0m \u001b[38;5;66;03m# Fit or load from cache the current transformer\u001b[39;00m\n\u001b[1;32m--> 406\u001b[0m X, fitted_transformer \u001b[38;5;241m=\u001b[39m \u001b[43mfit_transform_one_cached\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 407\u001b[0m \u001b[43m \u001b[49m\u001b[43mcloned_transformer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 408\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 409\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 410\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 411\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage_clsname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPipeline\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 412\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_message\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstep_idx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 413\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrouted_params\u001b[49m\u001b[43m[\u001b[49m\u001b[43mname\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 414\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 415\u001b[0m \u001b[38;5;66;03m# Replace the transformer of the step with the fitted\u001b[39;00m\n\u001b[0;32m 416\u001b[0m \u001b[38;5;66;03m# transformer. This is necessary when loading the transformer\u001b[39;00m\n\u001b[0;32m 417\u001b[0m \u001b[38;5;66;03m# from the cache.\u001b[39;00m\n\u001b[0;32m 418\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps[step_idx] \u001b[38;5;241m=\u001b[39m (name, fitted_transformer)\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\joblib\\memory.py:312\u001b[0m, in \u001b[0;36mNotMemorizedFunc.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 311\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 312\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py:1310\u001b[0m, in \u001b[0;36m_fit_transform_one\u001b[1;34m(transformer, X, y, weight, message_clsname, message, params)\u001b[0m\n\u001b[0;32m 1308\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(message_clsname, message):\n\u001b[0;32m 1309\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(transformer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m-> 1310\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mtransformer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfit_transform\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1311\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1312\u001b[0m res \u001b[38;5;241m=\u001b[39m transformer\u001b[38;5;241m.\u001b[39mfit(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit\u001b[39m\u001b[38;5;124m\"\u001b[39m, {}))\u001b[38;5;241m.\u001b[39mtransform(\n\u001b[0;32m 1313\u001b[0m X, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtransform\u001b[39m\u001b[38;5;124m\"\u001b[39m, {})\n\u001b[0;32m 1314\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\utils\\_set_output.py:316\u001b[0m, in \u001b[0;36m_wrap_method_output.<locals>.wrapped\u001b[1;34m(self, X, *args, **kwargs)\u001b[0m\n\u001b[0;32m 314\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(f)\n\u001b[0;32m 315\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapped\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 316\u001b[0m data_to_wrap \u001b[38;5;241m=\u001b[39m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_to_wrap, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[0;32m 318\u001b[0m \u001b[38;5;66;03m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[0;32m 319\u001b[0m return_tuple \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 320\u001b[0m _wrap_data_with_container(method, data_to_wrap[\u001b[38;5;241m0\u001b[39m], X, \u001b[38;5;28mself\u001b[39m),\n\u001b[0;32m 321\u001b[0m \u001b[38;5;241m*\u001b[39mdata_to_wrap[\u001b[38;5;241m1\u001b[39m:],\n\u001b[0;32m 322\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:965\u001b[0m, in \u001b[0;36mColumnTransformer.fit_transform\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 963\u001b[0m \u001b[38;5;66;03m# set n_features_in_ attribute\u001b[39;00m\n\u001b[0;32m 964\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_n_features(X, reset\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m--> 965\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_transformers\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 966\u001b[0m n_samples \u001b[38;5;241m=\u001b[39m _num_samples(X)\n\u001b[0;32m 968\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_column_callables(X)\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:501\u001b[0m, in \u001b[0;36mColumnTransformer._validate_transformers\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransformers:\n\u001b[0;32m 499\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m--> 501\u001b[0m names, transformers, _ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mzip\u001b[39m(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransformers)\n\u001b[0;32m 503\u001b[0m \u001b[38;5;66;03m# validate names\u001b[39;00m\n\u001b[0;32m 504\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_names(names)\n",
"\u001b[1;31mValueError\u001b[0m: not enough values to unpack (expected 3, got 2)"
]
}
],
"source": [
"preprocessing_result = pipeline_end.fit_transform(X_train)\n",
"preprocessed_df = pd.DataFrame(\n",
" preprocessing_result,\n",
" columns=pipeline_end.get_feature_names_out(),\n",
")\n",
"\n",
"preprocessed_df"
]
} }
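
The ValueError comes from features_preprocessing: ColumnTransformer expects (name, transformer, columns) triples, and the entry ("prepocessing_features", cat_imputer) has only two elements, so zip(*self.transformers) cannot unpack three values. The features_postprocessing step would fail next anyway, since this dataset has no Cabin_type column (it looks carried over from a Titanic example). A possible repair, sketched under the assumption that both can simply be dropped; it reuses columns_to_drop, num_columns and cat_columns plus the two sub-pipelines from the cell above, and imports StandardScaler from its canonical home in sklearn.preprocessing. Further adjustments may still be needed (the category-dtyped Age_category column lands in num_columns, and the trailing space in "Rank " must match the actual CSV header):

    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline

    # Every transformer entry must be a (name, transformer, columns) triple
    features_preprocessing = ColumnTransformer(
        verbose_feature_names_out=False,
        transformers=[
            ("preprocessing_num", preprocessing_num, num_columns),
            ("preprocessing_cat", preprocessing_cat, cat_columns),
        ],
        remainder="passthrough",
    )

    drop_columns = ColumnTransformer(
        verbose_feature_names_out=False,
        transformers=[("drop_columns", "drop", columns_to_drop)],
        remainder="passthrough",
    )

    pipeline_end = Pipeline([
        ("features_preprocessing", features_preprocessing),
        ("drop_columns", drop_columns),
    ])

    # With the malformed two-element entry gone, fit_transform no longer
    # raises the "not enough values to unpack" error
    preprocessing_result = pipeline_end.fit_transform(X_train)
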
],
"metadata": {