коммит2
This commit is contained in:
parent
41c4ab91ed
commit
8fc280c7e7
882
lab_4/lab4.ipynb
882
lab_4/lab4.ipynb
@ -33,7 +33,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Подготовим данные: удалим колонки rank и name (в них уникальные значения, которые не участвуют в предсказаниях), а также преобразуем номинальные колонки (country, source, industry) в числовые и категоризируем колонку age"
|
||||
"# Подготовим данные: категоризируем колонку age"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -45,167 +45,676 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Networth Age Country Source Industry\n",
|
||||
"0 219.0 50 United States Tesla, SpaceX Automotive \n",
|
||||
"1 171.0 58 United States Amazon Technology \n",
|
||||
"2 158.0 73 France LVMH Fashion & Retail \n",
|
||||
"3 129.0 66 United States Microsoft Technology \n",
|
||||
"4 118.0 91 United States Berkshire Hathaway Finance & Investments \n"
|
||||
"Rank 0\n",
|
||||
"Name 0\n",
|
||||
"Networth 0\n",
|
||||
"Age 0\n",
|
||||
"Country 0\n",
|
||||
"Source 0\n",
|
||||
"Industry 0\n",
|
||||
"dtype: int64\n",
|
||||
"\n",
|
||||
"Rank False\n",
|
||||
"Name False\n",
|
||||
"Networth False\n",
|
||||
"Age False\n",
|
||||
"Country False\n",
|
||||
"Source False\n",
|
||||
"Industry False\n",
|
||||
"dtype: bool\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Удаление колонок 'rank' и 'name'\n",
|
||||
"df.drop(columns=['Rank ', 'Name'], inplace=True)\n",
|
||||
"print(df.isnull().sum())\n",
|
||||
"\n",
|
||||
"# Проверка, что колонки были удалены\n",
|
||||
"print(df.head())"
|
||||
"print()\n",
|
||||
"\n",
|
||||
"# Есть ли пустые значения признаков\n",
|
||||
"print(df.isnull().any())\n",
|
||||
"\n",
|
||||
"print()\n",
|
||||
"\n",
|
||||
"# Процент пустых значений признаков\n",
|
||||
"for i in df.columns:\n",
|
||||
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
||||
" if null_rate > 0:\n",
|
||||
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Networth Country_Argentina Country_Australia Country_Austria \\\n",
|
||||
"0 219.0 False False False \n",
|
||||
"1 171.0 False False False \n",
|
||||
"2 158.0 False False False \n",
|
||||
"3 129.0 False False False \n",
|
||||
"4 118.0 False False False \n",
|
||||
" Rank Name Networth Country \\\n",
|
||||
"0 1 Elon Musk 219.0 United States \n",
|
||||
"1 2 Jeff Bezos 171.0 United States \n",
|
||||
"2 3 Bernard Arnault & family 158.0 France \n",
|
||||
"3 4 Bill Gates 129.0 United States \n",
|
||||
"4 5 Warren Buffett 118.0 United States \n",
|
||||
"\n",
|
||||
" Country_Barbados Country_Belgium Country_Belize Country_Brazil \\\n",
|
||||
"0 False False False False \n",
|
||||
"1 False False False False \n",
|
||||
"2 False False False False \n",
|
||||
"3 False False False False \n",
|
||||
"4 False False False False \n",
|
||||
"\n",
|
||||
" Country_Bulgaria Country_Canada ... wind wine winter wire wireless \\\n",
|
||||
"0 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"1 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"2 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"3 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"4 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"\n",
|
||||
" yahoo yogurt zara zoom Age \n",
|
||||
"0 0.0 0.0 0.0 0.0 50-60 \n",
|
||||
"1 0.0 0.0 0.0 0.0 50-60 \n",
|
||||
"2 0.0 0.0 0.0 0.0 70-80 \n",
|
||||
"3 0.0 0.0 0.0 0.0 60-70 \n",
|
||||
"4 0.0 0.0 0.0 0.0 80+ \n",
|
||||
"\n",
|
||||
"[5 rows x 828 columns]\n"
|
||||
" Source Industry Age_category \n",
|
||||
"0 Tesla, SpaceX Automotive 50-60 \n",
|
||||
"1 Amazon Technology 50-60 \n",
|
||||
"2 LVMH Fashion & Retail 70-80 \n",
|
||||
"3 Microsoft Technology 60-70 \n",
|
||||
"4 Berkshire Hathaway Finance & Investments 80+ \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"\n",
|
||||
"# Преобразуем 'country' и 'industry' в бинарные матрицы с помощью One-Hot Encoding\n",
|
||||
"df_country = pd.get_dummies(df[['Country']], drop_first=True)\n",
|
||||
"df_industry = pd.get_dummies(df[['Industry']], drop_first=True)\n",
|
||||
"\n",
|
||||
"# Преобразуем колонку 'source' с помощью TF-IDF\n",
|
||||
"tfidf_vectorizer = TfidfVectorizer(max_features=1000) \n",
|
||||
"X_tfidf = tfidf_vectorizer.fit_transform(df['Source']).toarray()\n",
|
||||
"\n",
|
||||
"# Создаем DataFrame с результатами TF-IDF\n",
|
||||
"df_source_tfidf = pd.DataFrame(X_tfidf, columns=tfidf_vectorizer.get_feature_names_out())\n",
|
||||
"\n",
|
||||
"bins = [0, 30, 40, 50, 60, 70, 80, 100] # границы для возрастных категорий\n",
|
||||
"bins = [0, 30, 40, 50, 60, 70, 80, 101] # границы для возрастных категорий\n",
|
||||
"labels = ['Under 30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'] # метки для категорий\n",
|
||||
"\n",
|
||||
"# Создаем новую колонку 'Age_category', где будет храниться категория\n",
|
||||
"df_age_group = pd.cut(df['Age'], bins=bins, labels=labels, right=False)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"df[\"Age_category\"] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)\n",
|
||||
"# Удаляем оригинальные колонки 'country', 'industry' и 'source' из исходного DataFrame\n",
|
||||
"df.drop(columns=['Country', 'Industry', 'Source', 'Age'], inplace=True)\n",
|
||||
"\n",
|
||||
"# Объединяем все преобразованные данные в один DataFrame\n",
|
||||
"df_transformed = pd.concat([df, df_country, df_industry, df_source_tfidf, df_age_group], axis=1)\n",
|
||||
"df.drop(columns=['Age'], inplace=True)\n",
|
||||
"\n",
|
||||
"# Просмотр результата\n",
|
||||
"print(df_transformed.head())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Разобьём набор данных на обучающую и тестовую выборки (80/20) для задачи классификации. Целевой признак — Age"
|
||||
"print(df.head())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "NameError",
|
||||
"evalue": "name 'df_transformed' is not defined",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[1;32mIn[18], line 46\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(df_input) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_train) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_val) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_test)\n\u001b[0;32m 43\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df_train, df_val, df_test, y_train, y_val, y_test\n\u001b[0;32m 45\u001b[0m X_train, X_val, X_test, y_train, y_val, y_test \u001b[38;5;241m=\u001b[39m split_stratified_into_train_val_test(\n\u001b[1;32m---> 46\u001b[0m \u001b[43mdf_transformed\u001b[49m, stratify_colname\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAge\u001b[39m\u001b[38;5;124m\"\u001b[39m, frac_train\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.80\u001b[39m, frac_val\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m, frac_test\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.20\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 47\u001b[0m )\n\u001b[0;32m 49\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mX_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, X_train)\n\u001b[0;32m 50\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124my_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, y_train)\n",
|
||||
"\u001b[1;31mNameError\u001b[0m: name 'df_transformed' is not defined"
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'X_train'"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Rank</th>\n",
|
||||
" <th>Name</th>\n",
|
||||
" <th>Networth</th>\n",
|
||||
" <th>Country</th>\n",
|
||||
" <th>Source</th>\n",
|
||||
" <th>Industry</th>\n",
|
||||
" <th>Age_category</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>1909</th>\n",
|
||||
" <td>1818</td>\n",
|
||||
" <td>Tran Ba Duong & family</td>\n",
|
||||
" <td>1.6</td>\n",
|
||||
" <td>Vietnam</td>\n",
|
||||
" <td>automotive</td>\n",
|
||||
" <td>Automotive</td>\n",
|
||||
" <td>60-70</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2099</th>\n",
|
||||
" <td>2076</td>\n",
|
||||
" <td>Mark Dixon</td>\n",
|
||||
" <td>1.4</td>\n",
|
||||
" <td>United Kingdom</td>\n",
|
||||
" <td>office real estate</td>\n",
|
||||
" <td>Real Estate</td>\n",
|
||||
" <td>60-70</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1392</th>\n",
|
||||
" <td>1341</td>\n",
|
||||
" <td>Yingzhuo Xu</td>\n",
|
||||
" <td>2.3</td>\n",
|
||||
" <td>China</td>\n",
|
||||
" <td>agribusiness</td>\n",
|
||||
" <td>Food & Beverage</td>\n",
|
||||
" <td>50-60</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>627</th>\n",
|
||||
" <td>622</td>\n",
|
||||
" <td>Bruce Flatt</td>\n",
|
||||
" <td>4.6</td>\n",
|
||||
" <td>Canada</td>\n",
|
||||
" <td>money management</td>\n",
|
||||
" <td>Finance & Investments</td>\n",
|
||||
" <td>50-60</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>527</th>\n",
|
||||
" <td>523</td>\n",
|
||||
" <td>Li Liangbin</td>\n",
|
||||
" <td>5.2</td>\n",
|
||||
" <td>China</td>\n",
|
||||
" <td>lithium</td>\n",
|
||||
" <td>Manufacturing</td>\n",
|
||||
" <td>50-60</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>84</th>\n",
|
||||
" <td>85</td>\n",
|
||||
" <td>Theo Albrecht, Jr. & family</td>\n",
|
||||
" <td>18.7</td>\n",
|
||||
" <td>Germany</td>\n",
|
||||
" <td>Aldi, Trader Joe's</td>\n",
|
||||
" <td>Fashion & Retail</td>\n",
|
||||
" <td>70-80</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>633</th>\n",
|
||||
" <td>622</td>\n",
|
||||
" <td>Tony Tamer</td>\n",
|
||||
" <td>4.6</td>\n",
|
||||
" <td>United States</td>\n",
|
||||
" <td>private equity</td>\n",
|
||||
" <td>Finance & Investments</td>\n",
|
||||
" <td>60-70</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>922</th>\n",
|
||||
" <td>913</td>\n",
|
||||
" <td>Bob Gaglardi</td>\n",
|
||||
" <td>3.3</td>\n",
|
||||
" <td>Canada</td>\n",
|
||||
" <td>hotels</td>\n",
|
||||
" <td>Real Estate</td>\n",
|
||||
" <td>80+</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2178</th>\n",
|
||||
" <td>2076</td>\n",
|
||||
" <td>Eugene Wu</td>\n",
|
||||
" <td>1.4</td>\n",
|
||||
" <td>Taiwan</td>\n",
|
||||
" <td>finance</td>\n",
|
||||
" <td>Finance & Investments</td>\n",
|
||||
" <td>70-80</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>415</th>\n",
|
||||
" <td>411</td>\n",
|
||||
" <td>Leonard Stern</td>\n",
|
||||
" <td>6.2</td>\n",
|
||||
" <td>United States</td>\n",
|
||||
" <td>real estate</td>\n",
|
||||
" <td>Real Estate</td>\n",
|
||||
" <td>80+</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>2080 rows × 7 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Rank Name Networth Country \\\n",
|
||||
"1909 1818 Tran Ba Duong & family 1.6 Vietnam \n",
|
||||
"2099 2076 Mark Dixon 1.4 United Kingdom \n",
|
||||
"1392 1341 Yingzhuo Xu 2.3 China \n",
|
||||
"627 622 Bruce Flatt 4.6 Canada \n",
|
||||
"527 523 Li Liangbin 5.2 China \n",
|
||||
"... ... ... ... ... \n",
|
||||
"84 85 Theo Albrecht, Jr. & family 18.7 Germany \n",
|
||||
"633 622 Tony Tamer 4.6 United States \n",
|
||||
"922 913 Bob Gaglardi 3.3 Canada \n",
|
||||
"2178 2076 Eugene Wu 1.4 Taiwan \n",
|
||||
"415 411 Leonard Stern 6.2 United States \n",
|
||||
"\n",
|
||||
" Source Industry Age_category \n",
|
||||
"1909 automotive Automotive 60-70 \n",
|
||||
"2099 office real estate Real Estate 60-70 \n",
|
||||
"1392 agribusiness Food & Beverage 50-60 \n",
|
||||
"627 money management Finance & Investments 50-60 \n",
|
||||
"527 lithium Manufacturing 50-60 \n",
|
||||
"... ... ... ... \n",
|
||||
"84 Aldi, Trader Joe's Fashion & Retail 70-80 \n",
|
||||
"633 private equity Finance & Investments 60-70 \n",
|
||||
"922 hotels Real Estate 80+ \n",
|
||||
"2178 finance Finance & Investments 70-80 \n",
|
||||
"415 real estate Real Estate 80+ \n",
|
||||
"\n",
|
||||
"[2080 rows x 7 columns]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'y_train'"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Age_category</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>1909</th>\n",
|
||||
" <td>60-70</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2099</th>\n",
|
||||
" <td>60-70</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1392</th>\n",
|
||||
" <td>50-60</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>627</th>\n",
|
||||
" <td>50-60</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>527</th>\n",
|
||||
" <td>50-60</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>84</th>\n",
|
||||
" <td>70-80</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>633</th>\n",
|
||||
" <td>60-70</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>922</th>\n",
|
||||
" <td>80+</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2178</th>\n",
|
||||
" <td>70-80</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>415</th>\n",
|
||||
" <td>80+</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>2080 rows × 1 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Age_category\n",
|
||||
"1909 60-70\n",
|
||||
"2099 60-70\n",
|
||||
"1392 50-60\n",
|
||||
"627 50-60\n",
|
||||
"527 50-60\n",
|
||||
"... ...\n",
|
||||
"84 70-80\n",
|
||||
"633 60-70\n",
|
||||
"922 80+\n",
|
||||
"2178 70-80\n",
|
||||
"415 80+\n",
|
||||
"\n",
|
||||
"[2080 rows x 1 columns]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'X_test'"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Rank</th>\n",
|
||||
" <th>Name</th>\n",
|
||||
" <th>Networth</th>\n",
|
||||
" <th>Country</th>\n",
|
||||
" <th>Source</th>\n",
|
||||
" <th>Industry</th>\n",
|
||||
" <th>Age_category</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>2075</th>\n",
|
||||
" <td>2076</td>\n",
|
||||
" <td>Radhe Shyam Agarwal</td>\n",
|
||||
" <td>1.4</td>\n",
|
||||
" <td>India</td>\n",
|
||||
" <td>consumer goods</td>\n",
|
||||
" <td>Fashion & Retail</td>\n",
|
||||
" <td>70-80</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1529</th>\n",
|
||||
" <td>1513</td>\n",
|
||||
" <td>Robert Duggan</td>\n",
|
||||
" <td>2.0</td>\n",
|
||||
" <td>United States</td>\n",
|
||||
" <td>pharmaceuticals</td>\n",
|
||||
" <td>Healthcare</td>\n",
|
||||
" <td>70-80</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1803</th>\n",
|
||||
" <td>1729</td>\n",
|
||||
" <td>Yao Kuizhang</td>\n",
|
||||
" <td>1.7</td>\n",
|
||||
" <td>China</td>\n",
|
||||
" <td>beverages</td>\n",
|
||||
" <td>Food & Beverage</td>\n",
|
||||
" <td>50-60</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>425</th>\n",
|
||||
" <td>424</td>\n",
|
||||
" <td>Alexei Kuzmichev</td>\n",
|
||||
" <td>6.0</td>\n",
|
||||
" <td>Russia</td>\n",
|
||||
" <td>oil, banking, telecom</td>\n",
|
||||
" <td>Energy</td>\n",
|
||||
" <td>50-60</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2597</th>\n",
|
||||
" <td>2578</td>\n",
|
||||
" <td>Ramesh Genomal</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>Philippines</td>\n",
|
||||
" <td>apparel</td>\n",
|
||||
" <td>Fashion & Retail</td>\n",
|
||||
" <td>70-80</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>935</th>\n",
|
||||
" <td>913</td>\n",
|
||||
" <td>Alfred Oetker</td>\n",
|
||||
" <td>3.3</td>\n",
|
||||
" <td>Germany</td>\n",
|
||||
" <td>consumer goods</td>\n",
|
||||
" <td>Fashion & Retail</td>\n",
|
||||
" <td>50-60</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1541</th>\n",
|
||||
" <td>1513</td>\n",
|
||||
" <td>Thomas Lee</td>\n",
|
||||
" <td>2.0</td>\n",
|
||||
" <td>United States</td>\n",
|
||||
" <td>private equity</td>\n",
|
||||
" <td>Finance & Investments</td>\n",
|
||||
" <td>70-80</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1646</th>\n",
|
||||
" <td>1645</td>\n",
|
||||
" <td>Roberto Angelini Rossi</td>\n",
|
||||
" <td>1.8</td>\n",
|
||||
" <td>Chile</td>\n",
|
||||
" <td>forestry, mining</td>\n",
|
||||
" <td>diversified</td>\n",
|
||||
" <td>70-80</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>376</th>\n",
|
||||
" <td>375</td>\n",
|
||||
" <td>Patrick Drahi</td>\n",
|
||||
" <td>6.6</td>\n",
|
||||
" <td>France</td>\n",
|
||||
" <td>telecom</td>\n",
|
||||
" <td>Telecom</td>\n",
|
||||
" <td>50-60</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1894</th>\n",
|
||||
" <td>1818</td>\n",
|
||||
" <td>Gerald Schwartz</td>\n",
|
||||
" <td>1.6</td>\n",
|
||||
" <td>Canada</td>\n",
|
||||
" <td>finance</td>\n",
|
||||
" <td>Finance & Investments</td>\n",
|
||||
" <td>80+</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>520 rows × 7 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Rank Name Networth Country \\\n",
|
||||
"2075 2076 Radhe Shyam Agarwal 1.4 India \n",
|
||||
"1529 1513 Robert Duggan 2.0 United States \n",
|
||||
"1803 1729 Yao Kuizhang 1.7 China \n",
|
||||
"425 424 Alexei Kuzmichev 6.0 Russia \n",
|
||||
"2597 2578 Ramesh Genomal 1.0 Philippines \n",
|
||||
"... ... ... ... ... \n",
|
||||
"935 913 Alfred Oetker 3.3 Germany \n",
|
||||
"1541 1513 Thomas Lee 2.0 United States \n",
|
||||
"1646 1645 Roberto Angelini Rossi 1.8 Chile \n",
|
||||
"376 375 Patrick Drahi 6.6 France \n",
|
||||
"1894 1818 Gerald Schwartz 1.6 Canada \n",
|
||||
"\n",
|
||||
" Source Industry Age_category \n",
|
||||
"2075 consumer goods Fashion & Retail 70-80 \n",
|
||||
"1529 pharmaceuticals Healthcare 70-80 \n",
|
||||
"1803 beverages Food & Beverage 50-60 \n",
|
||||
"425 oil, banking, telecom Energy 50-60 \n",
|
||||
"2597 apparel Fashion & Retail 70-80 \n",
|
||||
"... ... ... ... \n",
|
||||
"935 consumer goods Fashion & Retail 50-60 \n",
|
||||
"1541 private equity Finance & Investments 70-80 \n",
|
||||
"1646 forestry, mining diversified 70-80 \n",
|
||||
"376 telecom Telecom 50-60 \n",
|
||||
"1894 finance Finance & Investments 80+ \n",
|
||||
"\n",
|
||||
"[520 rows x 7 columns]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'y_test'"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Age_category</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>2075</th>\n",
|
||||
" <td>70-80</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1529</th>\n",
|
||||
" <td>70-80</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1803</th>\n",
|
||||
" <td>50-60</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>425</th>\n",
|
||||
" <td>50-60</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2597</th>\n",
|
||||
" <td>70-80</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>935</th>\n",
|
||||
" <td>50-60</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1541</th>\n",
|
||||
" <td>70-80</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1646</th>\n",
|
||||
" <td>70-80</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>376</th>\n",
|
||||
" <td>50-60</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1894</th>\n",
|
||||
" <td>80+</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>520 rows × 1 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Age_category\n",
|
||||
"2075 70-80\n",
|
||||
"1529 70-80\n",
|
||||
"1803 50-60\n",
|
||||
"425 50-60\n",
|
||||
"2597 70-80\n",
|
||||
"... ...\n",
|
||||
"935 50-60\n",
|
||||
"1541 70-80\n",
|
||||
"1646 70-80\n",
|
||||
"376 50-60\n",
|
||||
"1894 80+\n",
|
||||
"\n",
|
||||
"[520 rows x 1 columns]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from typing import Tuple\n",
|
||||
"import pandas as pd\n",
|
||||
"from pandas import DataFrame\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"def split_stratified_into_train_val_test(\n",
"    df_input,\n",
"    stratify_colname=\"y\",\n",
"    frac_train=0.6,\n",
"    frac_val=0.15,\n",
"    frac_test=0.25,\n",
"    random_state=None,\n",
") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
"    \"\"\"Split a dataframe into stratified train / validation / test parts.\n",
"\n",
"    Args:\n",
"        df_input: dataframe to split; it must contain ``stratify_colname``.\n",
"        stratify_colname: name of the column whose values are used for\n",
"            stratification.\n",
"        frac_train: fraction of rows for the training part.\n",
"        frac_val: fraction of rows for the validation part; a value <= 0\n",
"            disables the validation split.\n",
"        frac_test: fraction of rows for the test part.\n",
"        random_state: forwarded unchanged to ``train_test_split``.\n",
"\n",
"    Returns:\n",
"        ``(df_train, df_val, df_test, y_train, y_val, y_test)``. The ``df_*``\n",
"        frames keep every column of ``df_input`` (the stratification column\n",
"        included); the ``y_*`` frames hold only the stratification column.\n",
"        When ``frac_val <= 0`` both validation items are empty DataFrames.\n",
"\n",
"    Raises:\n",
"        ValueError: if the fractions do not sum to 1.0 (within floating-point\n",
"            tolerance) or ``stratify_colname`` is not a column of ``df_input``.\n",
"    \"\"\"\n",
"    import math  # local import keeps this definition self-contained\n",
"\n",
"    # Compare with a tolerance: 0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999,\n",
"    # which an exact `!= 1.0` comparison would wrongly reject.\n",
"    if not math.isclose(frac_train + frac_val + frac_test, 1.0):\n",
"        raise ValueError(\n",
"            \"fractions %f, %f, %f do not add up to 1.0\"\n",
"            % (frac_train, frac_val, frac_test)\n",
"        )\n",
"    if stratify_colname not in df_input.columns:\n",
"        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
"    X = df_input  # Contains all columns.\n",
"    y = df_input[[stratify_colname]]  # Dataframe of just the column on which to stratify.\n",
"    # Split original dataframe into train and temp dataframes.\n",
"    df_train, df_temp, y_train, y_temp = train_test_split(\n",
"        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
"    )\n",
"    if frac_val <= 0:\n",
"        # No validation part requested: the whole remainder is the test part.\n",
"        assert len(df_input) == len(df_train) + len(df_temp)\n",
"        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
"    # Split the temp dataframe into val and test dataframes.\n",
"    relative_frac_test = frac_test / (frac_val + frac_test)\n",
"    df_val, df_test, y_val, y_test = train_test_split(\n",
"        df_temp,\n",
"        y_temp,\n",
"        stratify=y_temp,\n",
"        test_size=relative_frac_test,\n",
"        random_state=random_state,\n",
"    )\n",
"    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"    return df_train, df_val, df_test, y_train, y_val, y_test\n",
|
||||
"from utils import split_stratified_into_train_val_test\n",
|
||||
"\n",
|
||||
"X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
|
||||
" df, stratify_colname=\"Age\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=None\n",
|
||||
" df, stratify_colname=\"Age_category\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=9\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"display(\"X_train\", X_train)\n",
|
||||
@ -214,6 +723,135 @@
|
||||
"display(\"X_test\", X_test)\n",
|
||||
"display(\"y_test\", y_test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Формирование конвейера для классификации данных\n",
|
||||
"## preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n",
|
||||
"## preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование\n",
|
||||
"## features_preprocessing -- трансформер для предобработки признаков\n",
|
||||
"## features_engineering -- трансформер для конструирования признаков\n",
|
||||
"## drop_columns -- трансформер для удаления колонок\n",
|
||||
"## pipeline_end -- основной конвейер предобработки данных и конструирования признаков"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"from sklearn.compose import ColumnTransformer\n",
|
||||
"from sklearn.discriminant_analysis import StandardScaler\n",
|
||||
"from sklearn.impute import SimpleImputer\n",
|
||||
"from sklearn.pipeline import Pipeline\n",
|
||||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"columns_to_drop = [\"Rank \", \"Name\"]\n",
|
||||
"num_columns = [\n",
|
||||
" column\n",
|
||||
" for column in df.columns\n",
|
||||
" if column not in columns_to_drop and df[column].dtype != \"object\"\n",
|
||||
"]\n",
|
||||
"cat_columns = [\n",
|
||||
" column\n",
|
||||
" for column in df.columns\n",
|
||||
" if column not in columns_to_drop and df[column].dtype == \"object\"\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"num_imputer = SimpleImputer(strategy=\"median\")\n",
|
||||
"num_scaler = StandardScaler()\n",
|
||||
"preprocessing_num = Pipeline(\n",
|
||||
" [\n",
|
||||
" (\"imputer\", num_imputer),\n",
|
||||
" (\"scaler\", num_scaler),\n",
|
||||
" ]\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n",
|
||||
"cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n",
|
||||
"preprocessing_cat = Pipeline(\n",
|
||||
" [\n",
|
||||
" (\"imputer\", cat_imputer),\n",
|
||||
" (\"encoder\", cat_encoder),\n",
|
||||
" ]\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"features_preprocessing = ColumnTransformer(\n",
|
||||
" verbose_feature_names_out=False,\n",
|
||||
" transformers=[\n",
|
||||
" (\"prepocessing_num\", preprocessing_num, num_columns),\n",
|
||||
" (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n",
|
||||
" (\"prepocessing_features\", cat_imputer),\n",
|
||||
" ],\n",
|
||||
" remainder=\"passthrough\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"drop_columns = ColumnTransformer(\n",
|
||||
" verbose_feature_names_out=False,\n",
|
||||
" transformers=[\n",
|
||||
" (\"drop_columns\", \"drop\", columns_to_drop),\n",
|
||||
" ],\n",
|
||||
" remainder=\"passthrough\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"features_postprocessing = ColumnTransformer(\n",
|
||||
" verbose_feature_names_out=False,\n",
|
||||
" transformers=[\n",
|
||||
" (\"prepocessing_cat\", preprocessing_cat, [\"Cabin_type\"]),\n",
|
||||
" ],\n",
|
||||
" remainder=\"passthrough\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"pipeline_end = Pipeline(\n",
|
||||
" [\n",
|
||||
" (\"features_preprocessing\", features_preprocessing),\n",
|
||||
" (\"drop_columns\", drop_columns),\n",
|
||||
" (\"features_postprocessing\", features_postprocessing),\n",
|
||||
" ]\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 31,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "ValueError",
|
||||
"evalue": "not enough values to unpack (expected 3, got 2)",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[1;32mIn[31], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m preprocessing_result \u001b[38;5;241m=\u001b[39m \u001b[43mpipeline_end\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m preprocessed_df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(\n\u001b[0;32m 3\u001b[0m preprocessing_result,\n\u001b[0;32m 4\u001b[0m columns\u001b[38;5;241m=\u001b[39mpipeline_end\u001b[38;5;241m.\u001b[39mget_feature_names_out(),\n\u001b[0;32m 5\u001b[0m )\n\u001b[0;32m 7\u001b[0m preprocessed_df\n",
|
||||
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py:533\u001b[0m, in \u001b[0;36mPipeline.fit_transform\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 490\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Fit the model and transform with the final estimator.\u001b[39;00m\n\u001b[0;32m 491\u001b[0m \n\u001b[0;32m 492\u001b[0m \u001b[38;5;124;03mFit all the transformers one after the other and sequentially transform\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 530\u001b[0m \u001b[38;5;124;03m Transformed samples.\u001b[39;00m\n\u001b[0;32m 531\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 532\u001b[0m routed_params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_method_params(method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m, props\u001b[38;5;241m=\u001b[39mparams)\n\u001b[1;32m--> 533\u001b[0m Xt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrouted_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 535\u001b[0m last_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_final_estimator\n\u001b[0;32m 536\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_log_message(\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m)):\n",
|
||||
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py:406\u001b[0m, in \u001b[0;36mPipeline._fit\u001b[1;34m(self, X, y, routed_params)\u001b[0m\n\u001b[0;32m 404\u001b[0m cloned_transformer \u001b[38;5;241m=\u001b[39m clone(transformer)\n\u001b[0;32m 405\u001b[0m \u001b[38;5;66;03m# Fit or load from cache the current transformer\u001b[39;00m\n\u001b[1;32m--> 406\u001b[0m X, fitted_transformer \u001b[38;5;241m=\u001b[39m \u001b[43mfit_transform_one_cached\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 407\u001b[0m \u001b[43m \u001b[49m\u001b[43mcloned_transformer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 408\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 409\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 410\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 411\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage_clsname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPipeline\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 412\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_message\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstep_idx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 413\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrouted_params\u001b[49m\u001b[43m[\u001b[49m\u001b[43mname\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 414\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 415\u001b[0m \u001b[38;5;66;03m# Replace the transformer of the step with the fitted\u001b[39;00m\n\u001b[0;32m 416\u001b[0m \u001b[38;5;66;03m# transformer. 
This is necessary when loading the transformer\u001b[39;00m\n\u001b[0;32m 417\u001b[0m \u001b[38;5;66;03m# from the cache.\u001b[39;00m\n\u001b[0;32m 418\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps[step_idx] \u001b[38;5;241m=\u001b[39m (name, fitted_transformer)\n",
|
||||
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\joblib\\memory.py:312\u001b[0m, in \u001b[0;36mNotMemorizedFunc.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 311\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 312\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py:1310\u001b[0m, in \u001b[0;36m_fit_transform_one\u001b[1;34m(transformer, X, y, weight, message_clsname, message, params)\u001b[0m\n\u001b[0;32m 1308\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(message_clsname, message):\n\u001b[0;32m 1309\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(transformer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m-> 1310\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mtransformer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfit_transform\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1311\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1312\u001b[0m res \u001b[38;5;241m=\u001b[39m transformer\u001b[38;5;241m.\u001b[39mfit(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit\u001b[39m\u001b[38;5;124m\"\u001b[39m, {}))\u001b[38;5;241m.\u001b[39mtransform(\n\u001b[0;32m 1313\u001b[0m X, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtransform\u001b[39m\u001b[38;5;124m\"\u001b[39m, {})\n\u001b[0;32m 1314\u001b[0m )\n",
|
||||
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\utils\\_set_output.py:316\u001b[0m, in \u001b[0;36m_wrap_method_output.<locals>.wrapped\u001b[1;34m(self, X, *args, **kwargs)\u001b[0m\n\u001b[0;32m 314\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(f)\n\u001b[0;32m 315\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapped\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 316\u001b[0m data_to_wrap \u001b[38;5;241m=\u001b[39m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_to_wrap, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[0;32m 318\u001b[0m \u001b[38;5;66;03m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[0;32m 319\u001b[0m return_tuple \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 320\u001b[0m _wrap_data_with_container(method, data_to_wrap[\u001b[38;5;241m0\u001b[39m], X, \u001b[38;5;28mself\u001b[39m),\n\u001b[0;32m 321\u001b[0m \u001b[38;5;241m*\u001b[39mdata_to_wrap[\u001b[38;5;241m1\u001b[39m:],\n\u001b[0;32m 322\u001b[0m )\n",
|
||||
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:965\u001b[0m, in \u001b[0;36mColumnTransformer.fit_transform\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 963\u001b[0m \u001b[38;5;66;03m# set n_features_in_ attribute\u001b[39;00m\n\u001b[0;32m 964\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_n_features(X, reset\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m--> 965\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_transformers\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 966\u001b[0m n_samples \u001b[38;5;241m=\u001b[39m _num_samples(X)\n\u001b[0;32m 968\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_column_callables(X)\n",
|
||||
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:501\u001b[0m, in \u001b[0;36mColumnTransformer._validate_transformers\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransformers:\n\u001b[0;32m 499\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m--> 501\u001b[0m names, transformers, _ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mzip\u001b[39m(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransformers)\n\u001b[0;32m 503\u001b[0m \u001b[38;5;66;03m# validate names\u001b[39;00m\n\u001b[0;32m 504\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_names(names)\n",
|
||||
"\u001b[1;31mValueError\u001b[0m: not enough values to unpack (expected 3, got 2)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"preprocessing_result = pipeline_end.fit_transform(X_train)\n",
|
||||
"preprocessed_df = pd.DataFrame(\n",
|
||||
" preprocessing_result,\n",
|
||||
" columns=pipeline_end.get_feature_names_out(),\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"preprocessed_df"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
Loading…
Reference in New Issue
Block a user