ещё коммит...

This commit is contained in:
annalyovushkina@yandex.ru 2024-11-29 01:37:09 +04:00
parent 4135ed6c05
commit a0218641ee

View File

@ -2779,7 +2779,7 @@
},
{
"cell_type": "code",
"execution_count": 60,
"execution_count": null,
"metadata": {},
"outputs": [
{
@ -3182,7 +3182,197 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Rank</th>\n",
" <th>Name</th>\n",
" <th>Networth</th>\n",
" <th>Age</th>\n",
" <th>Country</th>\n",
" <th>Source</th>\n",
" <th>Industry</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Elon Musk</td>\n",
" <td>219.0</td>\n",
" <td>50</td>\n",
" <td>United States</td>\n",
" <td>Tesla, SpaceX</td>\n",
" <td>Automotive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Jeff Bezos</td>\n",
" <td>171.0</td>\n",
" <td>58</td>\n",
" <td>United States</td>\n",
" <td>Amazon</td>\n",
" <td>Technology</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Bernard Arnault &amp; family</td>\n",
" <td>158.0</td>\n",
" <td>73</td>\n",
" <td>France</td>\n",
" <td>LVMH</td>\n",
" <td>Fashion &amp; Retail</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Bill Gates</td>\n",
" <td>129.0</td>\n",
" <td>66</td>\n",
" <td>United States</td>\n",
" <td>Microsoft</td>\n",
" <td>Technology</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Warren Buffett</td>\n",
" <td>118.0</td>\n",
" <td>91</td>\n",
" <td>United States</td>\n",
" <td>Berkshire Hathaway</td>\n",
" <td>Finance &amp; Investments</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2595</th>\n",
" <td>2578</td>\n",
" <td>Jorge Gallardo Ballart</td>\n",
" <td>1.0</td>\n",
" <td>80</td>\n",
" <td>Spain</td>\n",
" <td>pharmaceuticals</td>\n",
" <td>Healthcare</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2596</th>\n",
" <td>2578</td>\n",
" <td>Nari Genomal</td>\n",
" <td>1.0</td>\n",
" <td>82</td>\n",
" <td>Philippines</td>\n",
" <td>apparel</td>\n",
" <td>Fashion &amp; Retail</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2597</th>\n",
" <td>2578</td>\n",
" <td>Ramesh Genomal</td>\n",
" <td>1.0</td>\n",
" <td>71</td>\n",
" <td>Philippines</td>\n",
" <td>apparel</td>\n",
" <td>Fashion &amp; Retail</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2598</th>\n",
" <td>2578</td>\n",
" <td>Sunder Genomal</td>\n",
" <td>1.0</td>\n",
" <td>68</td>\n",
" <td>Philippines</td>\n",
" <td>garments</td>\n",
" <td>Fashion &amp; Retail</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2599</th>\n",
" <td>2578</td>\n",
" <td>Horst-Otto Gerberding</td>\n",
" <td>1.0</td>\n",
" <td>69</td>\n",
" <td>Germany</td>\n",
" <td>flavors and fragrances</td>\n",
" <td>Food &amp; Beverage</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2600 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" Rank Name Networth Age Country \\\n",
"0 1 Elon Musk 219.0 50 United States \n",
"1 2 Jeff Bezos 171.0 58 United States \n",
"2 3 Bernard Arnault & family 158.0 73 France \n",
"3 4 Bill Gates 129.0 66 United States \n",
"4 5 Warren Buffett 118.0 91 United States \n",
"... ... ... ... ... ... \n",
"2595 2578 Jorge Gallardo Ballart 1.0 80 Spain \n",
"2596 2578 Nari Genomal 1.0 82 Philippines \n",
"2597 2578 Ramesh Genomal 1.0 71 Philippines \n",
"2598 2578 Sunder Genomal 1.0 68 Philippines \n",
"2599 2578 Horst-Otto Gerberding 1.0 69 Germany \n",
"\n",
" Source Industry \n",
"0 Tesla, SpaceX Automotive \n",
"1 Amazon Technology \n",
"2 LVMH Fashion & Retail \n",
"3 Microsoft Technology \n",
"4 Berkshire Hathaway Finance & Investments \n",
"... ... ... \n",
"2595 pharmaceuticals Healthcare \n",
"2596 apparel Fashion & Retail \n",
"2597 apparel Fashion & Retail \n",
"2598 garments Fashion & Retail \n",
"2599 flavors and fragrances Food & Beverage \n",
"\n",
"[2600 rows x 7 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
@ -3763,7 +3953,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 13,
"metadata": {},
"outputs": [
{
@ -3780,8 +3970,6 @@
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:320: UserWarning: The total space of parameters 1 is smaller than n_iter=10. Running 1 iterations. For exhaustive searches, use GridSearchCV.\n",
" warnings.warn(\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros\n",
" warnings.warn(\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:320: UserWarning: The total space of parameters 9 is smaller than n_iter=10. Running 9 iterations. For exhaustive searches, use GridSearchCV.\n",
" warnings.warn(\n"
]
},
@ -3813,16 +4001,16 @@
"R2: -7135788186375614.0\n",
"\n",
"Model: RandomForestRegressor\n",
"Best Params: {'model__n_estimators': 100, 'model__max_depth': None}\n",
"MAE: 3.372747412240537\n",
"RMSE: 8.304219801175332\n",
"R2: -1.9013866015383956\n",
"Best Params: {'model__n_estimators': 40, 'model__max_depth': 10}\n",
"MAE: 3.454630023161808\n",
"RMSE: 7.755775760541111\n",
"R2: -1.530803448377045\n",
"\n",
"Model: GradientBoostingRegressor\n",
"Best Params: {'model__n_estimators': 200, 'model__max_depth': 5, 'model__learning_rate': 0.2}\n",
"MAE: 3.572597806187309\n",
"RMSE: 10.306842221909957\n",
"R2: -3.4695025074945356\n"
"Best Params: {'model__n_estimators': 100, 'model__max_depth': 4, 'model__learning_rate': 0.4}\n",
"MAE: 3.585784679817764\n",
"RMSE: 10.312249036012052\n",
"R2: -3.474193004771121\n"
]
},
{
@ -3855,13 +4043,13 @@
"param_grids_regression = {\n",
" \"LinearRegression\": {},\n",
" \"RandomForestRegressor\": {\n",
" 'model__n_estimators': [50, 100, 200],\n",
" 'model__max_depth': [None, 10, 20],\n",
" 'model__n_estimators': [10, 20, 30, 40, 50, 100, 150, 200, 250, 500],\n",
" 'model__max_depth': [None, 2, 3, 4, 5, 6, 7, 8, 9 ,10],\n",
" },\n",
" \"GradientBoostingRegressor\": {\n",
" 'model__n_estimators': [50, 100, 200],\n",
" 'model__learning_rate': [0.01, 0.1, 0.2],\n",
" 'model__max_depth': [3, 5, 10]\n",
" 'model__n_estimators': [10, 20, 30, 40, 50, 100, 150, 200, 250, 500],\n",
" 'model__learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5],\n",
" 'model__max_depth': [2, 3, 4, 5, 6, 7, 8, 9 ,10]\n",
" }\n",
"}\n",
"\n",
@ -3905,66 +4093,66 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"#T_c5529_row0_col0, #T_c5529_row0_col1, #T_c5529_row1_col0, #T_c5529_row1_col1 {\n",
"#T_5e893_row0_col0, #T_5e893_row0_col1, #T_5e893_row1_col0, #T_5e893_row1_col1 {\n",
" background-color: #26818e;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_c5529_row0_col2, #T_c5529_row1_col2 {\n",
"#T_5e893_row0_col2, #T_5e893_row1_col2 {\n",
" background-color: #da5a6a;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_c5529_row2_col0, #T_c5529_row2_col1 {\n",
"#T_5e893_row2_col0, #T_5e893_row2_col1 {\n",
" background-color: #a8db34;\n",
" color: #000000;\n",
"}\n",
"#T_c5529_row2_col2 {\n",
"#T_5e893_row2_col2 {\n",
" background-color: #4e02a2;\n",
" color: #f1f1f1;\n",
"}\n",
"</style>\n",
"<table id=\"T_c5529\">\n",
"<table id=\"T_5e893\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"blank level0\" >&nbsp;</th>\n",
" <th id=\"T_c5529_level0_col0\" class=\"col_heading level0 col0\" >MAE</th>\n",
" <th id=\"T_c5529_level0_col1\" class=\"col_heading level0 col1\" >RMSE</th>\n",
" <th id=\"T_c5529_level0_col2\" class=\"col_heading level0 col2\" >R2</th>\n",
" <th id=\"T_5e893_level0_col0\" class=\"col_heading level0 col0\" >MAE</th>\n",
" <th id=\"T_5e893_level0_col1\" class=\"col_heading level0 col1\" >RMSE</th>\n",
" <th id=\"T_5e893_level0_col2\" class=\"col_heading level0 col2\" >R2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th id=\"T_c5529_level0_row0\" class=\"row_heading level0 row0\" >RandomForestRegressor</th>\n",
" <td id=\"T_c5529_row0_col0\" class=\"data row0 col0\" >3.372747</td>\n",
" <td id=\"T_c5529_row0_col1\" class=\"data row0 col1\" >8.304220</td>\n",
" <td id=\"T_c5529_row0_col2\" class=\"data row0 col2\" >-1.901387</td>\n",
" <th id=\"T_5e893_level0_row0\" class=\"row_heading level0 row0\" >RandomForestRegressor</th>\n",
" <td id=\"T_5e893_row0_col0\" class=\"data row0 col0\" >3.454630</td>\n",
" <td id=\"T_5e893_row0_col1\" class=\"data row0 col1\" >7.755776</td>\n",
" <td id=\"T_5e893_row0_col2\" class=\"data row0 col2\" >-1.530803</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_c5529_level0_row1\" class=\"row_heading level0 row1\" >GradientBoostingRegressor</th>\n",
" <td id=\"T_c5529_row1_col0\" class=\"data row1 col0\" >3.572598</td>\n",
" <td id=\"T_c5529_row1_col1\" class=\"data row1 col1\" >10.306842</td>\n",
" <td id=\"T_c5529_row1_col2\" class=\"data row1 col2\" >-3.469503</td>\n",
" <th id=\"T_5e893_level0_row1\" class=\"row_heading level0 row1\" >GradientBoostingRegressor</th>\n",
" <td id=\"T_5e893_row1_col0\" class=\"data row1 col0\" >3.585785</td>\n",
" <td id=\"T_5e893_row1_col1\" class=\"data row1 col1\" >10.312249</td>\n",
" <td id=\"T_5e893_row1_col2\" class=\"data row1 col2\" >-3.474193</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_c5529_level0_row2\" class=\"row_heading level0 row2\" >LinearRegression</th>\n",
" <td id=\"T_c5529_row2_col0\" class=\"data row2 col0\" >18059903.801767</td>\n",
" <td id=\"T_c5529_row2_col1\" class=\"data row2 col1\" >411829080.658451</td>\n",
" <td id=\"T_c5529_row2_col2\" class=\"data row2 col2\" >-7135788186375614.000000</td>\n",
" <th id=\"T_5e893_level0_row2\" class=\"row_heading level0 row2\" >LinearRegression</th>\n",
" <td id=\"T_5e893_row2_col0\" class=\"data row2 col0\" >18059903.801767</td>\n",
" <td id=\"T_5e893_row2_col1\" class=\"data row2 col1\" >411829080.658451</td>\n",
" <td id=\"T_5e893_row2_col2\" class=\"data row2 col2\" >-7135788186375614.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x182a6929490>"
"<pandas.io.formats.style.Styler at 0x182a6043ef0>"
]
},
"execution_count": 9,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@ -3987,6 +4175,174 @@
"\n",
"styled_metrics"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Шикарный вывод: по стране, возрасту, сфере деятельности и источнику доходов невозможно предсказать состояние человека. Значит ли это, что кто угодно, где угодно и в чём угодно может добиться успеха?"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Классификация"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Rank Name Networth Country \\\n",
"0 1 Elon Musk 219.0 United States \n",
"1 2 Jeff Bezos 171.0 United States \n",
"2 3 Bernard Arnault & family 158.0 France \n",
"3 4 Bill Gates 129.0 United States \n",
"4 5 Warren Buffett 118.0 United States \n",
"\n",
" Source Industry Age_category \n",
"0 Tesla, SpaceX Automotive 50-60 \n",
"1 Amazon Technology 50-60 \n",
"2 LVMH Fashion & Retail 70-80 \n",
"3 Microsoft Technology 60-70 \n",
"4 Berkshire Hathaway Finance & Investments 80+ \n"
]
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Age-bucket edges. With right=False every bucket is half-open: [low, high).\n",
"bins = [0, 30, 40, 50, 60, 70, 80, 101]\n",
"# One label per bucket, listed in the same order as the edges above.\n",
"labels = ['Under 30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+']\n",
"\n",
"# Load the data set, derive the categorical 'Age_category' column from 'Age',\n",
"# then drop the numeric 'Age' column the category was built from.\n",
"df = pd.read_csv(\"C://Users//annal//aim//static//csv//Forbes_Billionaires.csv\")\n",
"df = df.assign(\n",
"    Age_category=pd.cut(df['Age'], bins=bins, labels=labels, right=False)\n",
").drop(columns=['Age'])\n",
"\n",
"# Preview the result\n",
"print(df.head())"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training LogisticRegression...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:320: UserWarning: The total space of parameters 3 is smaller than n_iter=10. Running 3 iterations. For exhaustive searches, use GridSearchCV.\n",
" warnings.warn(\n"
]
},
{
"ename": "ValueError",
"evalue": "\nAll the 15 fits failed.\nIt is very likely that your model is misconfigured.\nYou can try to debug the error by setting error_score='raise'.\n\nBelow are more details about the failures:\n--------------------------------------------------------------------------------\n15 fits failed with the following error:\nTraceback (most recent call last):\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\pandas\\core\\indexes\\base.py\", line 3805, in get_loc\n return self._engine.get_loc(casted_key)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"index.pyx\", line 167, in pandas._libs.index.IndexEngine.get_loc\n File \"index.pyx\", line 196, in pandas._libs.index.IndexEngine.get_loc\n File \"pandas\\\\_libs\\\\hashtable_class_helper.pxi\", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item\n File \"pandas\\\\_libs\\\\hashtable_class_helper.pxi\", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item\nKeyError: 'Age'\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\utils\\_indexing.py\", line 361, in _get_column_indices\n col_idx = all_columns.get_loc(col)\n ^^^^^^^^^^^^^^^^^^^^^^^^\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\pandas\\core\\indexes\\base.py\", line 3812, in get_loc\n raise KeyError(key) from err\nKeyError: 'Age'\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 888, in _fit_and_score\n estimator.fit(X_train, y_train, **fit_params)\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\base.py\", line 1473, in wrapper\n return fit_method(estimator, *args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py\", line 469, in 
fit\n Xt = self._fit(X, y, routed_params)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py\", line 406, in _fit\n X, fitted_transformer = fit_transform_one_cached(\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\joblib\\memory.py\", line 312, in __call__\n return self.func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py\", line 1310, in _fit_transform_one\n res = transformer.fit_transform(X, y, **params.get(\"fit_transform\", {}))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\utils\\_set_output.py\", line 316, in wrapped\n data_to_wrap = f(self, X, *args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\base.py\", line 1473, in wrapper\n return fit_method(estimator, *args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py\", line 968, in fit_transform\n self._validate_column_callables(X)\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py\", line 536, in _validate_column_callables\n transformer_to_input_indices[name] = _get_column_indices(X, columns)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\utils\\_indexing.py\", line 369, in _get_column_indices\n raise ValueError(\"A given column is not a column of the dataframe\") from e\nValueError: A given column is not a column of the dataframe\n",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[18], line 48\u001b[0m\n\u001b[0;32m 46\u001b[0m param_grid \u001b[38;5;241m=\u001b[39m param_grids_classification[name]\n\u001b[0;32m 47\u001b[0m grid_search \u001b[38;5;241m=\u001b[39m RandomizedSearchCV(pipeline, param_grid, cv\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m5\u001b[39m, scoring\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mf1\u001b[39m\u001b[38;5;124m'\u001b[39m, n_jobs\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m---> 48\u001b[0m \u001b[43mgrid_search\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train_clf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train_clf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 50\u001b[0m \u001b[38;5;66;03m# Лучшая модель\u001b[39;00m\n\u001b[0;32m 51\u001b[0m best_model \u001b[38;5;241m=\u001b[39m grid_search\u001b[38;5;241m.\u001b[39mbest_estimator_\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:1019\u001b[0m, in \u001b[0;36mBaseSearchCV.fit\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 1013\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_format_results(\n\u001b[0;32m 1014\u001b[0m all_candidate_params, n_splits, all_out, all_more_results\n\u001b[0;32m 1015\u001b[0m )\n\u001b[0;32m 1017\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m results\n\u001b[1;32m-> 1019\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_search\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevaluate_candidates\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1021\u001b[0m \u001b[38;5;66;03m# multimetric is determined here because in the case of a callable\u001b[39;00m\n\u001b[0;32m 1022\u001b[0m \u001b[38;5;66;03m# self.scoring the return type is only known after calling\u001b[39;00m\n\u001b[0;32m 1023\u001b[0m first_test_score \u001b[38;5;241m=\u001b[39m all_out[\u001b[38;5;241m0\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_scores\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:1960\u001b[0m, in \u001b[0;36mRandomizedSearchCV._run_search\u001b[1;34m(self, evaluate_candidates)\u001b[0m\n\u001b[0;32m 1958\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_run_search\u001b[39m(\u001b[38;5;28mself\u001b[39m, evaluate_candidates):\n\u001b[0;32m 1959\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Search n_iter candidates from param_distributions\"\"\"\u001b[39;00m\n\u001b[1;32m-> 1960\u001b[0m \u001b[43mevaluate_candidates\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1961\u001b[0m \u001b[43m \u001b[49m\u001b[43mParameterSampler\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1962\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparam_distributions\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mn_iter\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrandom_state\u001b[49m\n\u001b[0;32m 1963\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1964\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:996\u001b[0m, in \u001b[0;36mBaseSearchCV.fit.<locals>.evaluate_candidates\u001b[1;34m(candidate_params, cv, more_results)\u001b[0m\n\u001b[0;32m 989\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(out) \u001b[38;5;241m!=\u001b[39m n_candidates \u001b[38;5;241m*\u001b[39m n_splits:\n\u001b[0;32m 990\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 991\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcv.split and cv.get_n_splits returned \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 992\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minconsistent results. Expected \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 993\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msplits, got \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(n_splits, \u001b[38;5;28mlen\u001b[39m(out) \u001b[38;5;241m/\u001b[39m\u001b[38;5;241m/\u001b[39m n_candidates)\n\u001b[0;32m 994\u001b[0m )\n\u001b[1;32m--> 996\u001b[0m \u001b[43m_warn_or_raise_about_fit_failures\u001b[49m\u001b[43m(\u001b[49m\u001b[43mout\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43merror_score\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 998\u001b[0m \u001b[38;5;66;03m# For callable self.scoring, the return type is only know after\u001b[39;00m\n\u001b[0;32m 999\u001b[0m \u001b[38;5;66;03m# calling. If the return type is a dictionary, the error scores\u001b[39;00m\n\u001b[0;32m 1000\u001b[0m \u001b[38;5;66;03m# can now be inserted with the correct key. 
The type checking\u001b[39;00m\n\u001b[0;32m 1001\u001b[0m \u001b[38;5;66;03m# of out will be done in `_insert_error_scores`.\u001b[39;00m\n\u001b[0;32m 1002\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mcallable\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscoring):\n",
"File \u001b[1;32mc:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py:529\u001b[0m, in \u001b[0;36m_warn_or_raise_about_fit_failures\u001b[1;34m(results, error_score)\u001b[0m\n\u001b[0;32m 522\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m num_failed_fits \u001b[38;5;241m==\u001b[39m num_fits:\n\u001b[0;32m 523\u001b[0m all_fits_failed_message \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 524\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mAll the \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnum_fits\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m fits failed.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 525\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIt is very likely that your model is misconfigured.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 526\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYou can try to debug the error by setting error_score=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mraise\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 527\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mBelow are more details about the failures:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mfit_errors_summary\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 528\u001b[0m )\n\u001b[1;32m--> 529\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(all_fits_failed_message)\n\u001b[0;32m 531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 532\u001b[0m some_fits_failed_message \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 533\u001b[0m 
\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mnum_failed_fits\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m fits failed out of a total of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnum_fits\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 534\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe score on these train-test partitions for these parameters\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 538\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mBelow are more details about the failures:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mfit_errors_summary\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 539\u001b[0m )\n",
"\u001b[1;31mValueError\u001b[0m: \nAll the 15 fits failed.\nIt is very likely that your model is misconfigured.\nYou can try to debug the error by setting error_score='raise'.\n\nBelow are more details about the failures:\n--------------------------------------------------------------------------------\n15 fits failed with the following error:\nTraceback (most recent call last):\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\pandas\\core\\indexes\\base.py\", line 3805, in get_loc\n return self._engine.get_loc(casted_key)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"index.pyx\", line 167, in pandas._libs.index.IndexEngine.get_loc\n File \"index.pyx\", line 196, in pandas._libs.index.IndexEngine.get_loc\n File \"pandas\\\\_libs\\\\hashtable_class_helper.pxi\", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item\n File \"pandas\\\\_libs\\\\hashtable_class_helper.pxi\", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item\nKeyError: 'Age'\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\utils\\_indexing.py\", line 361, in _get_column_indices\n col_idx = all_columns.get_loc(col)\n ^^^^^^^^^^^^^^^^^^^^^^^^\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\pandas\\core\\indexes\\base.py\", line 3812, in get_loc\n raise KeyError(key) from err\nKeyError: 'Age'\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 888, in _fit_and_score\n estimator.fit(X_train, y_train, **fit_params)\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\base.py\", line 1473, in wrapper\n return fit_method(estimator, *args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File 
\"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py\", line 469, in fit\n Xt = self._fit(X, y, routed_params)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py\", line 406, in _fit\n X, fitted_transformer = fit_transform_one_cached(\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\joblib\\memory.py\", line 312, in __call__\n return self.func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\pipeline.py\", line 1310, in _fit_transform_one\n res = transformer.fit_transform(X, y, **params.get(\"fit_transform\", {}))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\utils\\_set_output.py\", line 316, in wrapped\n data_to_wrap = f(self, X, *args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\base.py\", line 1473, in wrapper\n return fit_method(estimator, *args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py\", line 968, in fit_transform\n self._validate_column_callables(X)\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py\", line 536, in _validate_column_callables\n transformer_to_input_indices[name] = _get_column_indices(X, columns)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\sklearn\\utils\\_indexing.py\", line 369, in _get_column_indices\n raise ValueError(\"A given column is not a column of the dataframe\") from e\nValueError: A given column is not a column of the dataframe\n"
]
}
],
"source": [
"from sklearn.compose import ColumnTransformer, make_column_selector\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import accuracy_score, confusion_matrix, f1_score\n",
"from sklearn.model_selection import ParameterGrid, RandomizedSearchCV, train_test_split\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
"\n",
"# Multiclass task: predict 'Age_category' from the remaining columns of df.\n",
"# pd.cut leaves NaN for ages outside the bins; classifiers cannot fit a NaN target,\n",
"# so such rows (if any) are excluded here.\n",
"clf_df = df.dropna(subset=['Age_category'])\n",
"X = clf_df.drop(columns=['Age_category', 'Rank ', 'Name'])  # features\n",
"# Target variable for classification\n",
"y_class = clf_df['Age_category']\n",
"\n",
"# Train / test split\n",
"X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X, y_class, test_size=0.2, random_state=42)\n",
"\n",
"# The `features_preprocessing` transformer defined earlier in the notebook looks up an\n",
"# 'Age' column, which is no longer part of df, so every fit failed with KeyError: 'Age'.\n",
"# Build a dedicated preprocessor that picks columns by dtype instead of by name.\n",
"clf_preprocessing = ColumnTransformer(transformers=[\n",
"    ('num', StandardScaler(), make_column_selector(dtype_include='number')),\n",
"    # handle_unknown='ignore': a CV fold may contain categories unseen during fit.\n",
"    ('cat', OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_exclude='number')),\n",
"])\n",
"\n",
"# The target has more than two classes, so the default binary F1 is not applicable;\n",
"# use the same averaging for model selection and for the final report.\n",
"F1_AVERAGE = 'weighted'\n",
"MAX_SEARCH_ITER = 10\n",
"\n",
"# Models and their hyper-parameter grids\n",
"models_classification = {\n",
"    \"LogisticRegression\": LogisticRegression(max_iter=1000),\n",
"    \"RandomForestClassifier\": RandomForestClassifier(random_state=42),\n",
"    \"KNN\": KNeighborsClassifier()\n",
"}\n",
"\n",
"param_grids_classification = {\n",
"    \"LogisticRegression\": {\n",
"        'model__C': [0.1, 1, 10]\n",
"    },\n",
"    \"RandomForestClassifier\": {\n",
"        \"model__n_estimators\": [10, 20, 30, 40, 50, 100, 150, 200, 250, 500],\n",
"        \"model__max_features\": [\"sqrt\", \"log2\", 2],\n",
"        \"model__max_depth\": [2, 3, 4, 5, 6, 7, 8, 9, 10, 20],\n",
"        \"model__criterion\": [\"gini\", \"entropy\", \"log_loss\"],\n",
"    },\n",
"    \"KNN\": {\n",
"        'model__n_neighbors': [3, 5, 7, 9, 11],\n",
"        'model__weights': ['uniform', 'distance']\n",
"    }\n",
"}\n",
"\n",
"# Results, keyed by model name\n",
"results_classification = {}\n",
"\n",
"# Tune and evaluate every model\n",
"for name, model in models_classification.items():\n",
"    print(f\"Training {name}...\")\n",
"    pipeline = Pipeline(steps=[\n",
"        ('features_preprocessing', clf_preprocessing),\n",
"        ('model', model)\n",
"    ])\n",
"    param_grid = param_grids_classification[name]\n",
"    # Never request more candidates than the grid holds (avoids the n_iter UserWarning).\n",
"    n_candidates = min(MAX_SEARCH_ITER, len(ParameterGrid(param_grid)))\n",
"    grid_search = RandomizedSearchCV(\n",
"        pipeline,\n",
"        param_grid,\n",
"        n_iter=n_candidates,\n",
"        cv=5,\n",
"        scoring=f'f1_{F1_AVERAGE}',\n",
"        n_jobs=-1,\n",
"        random_state=42,  # reproducible candidate sampling\n",
"    )\n",
"    grid_search.fit(X_train_clf, y_train_clf)\n",
"\n",
"    # Best model, refit on the whole training split\n",
"    best_model = grid_search.best_estimator_\n",
"    y_pred = best_model.predict(X_test_clf)\n",
"\n",
"    # Metrics on the held-out test split\n",
"    acc = accuracy_score(y_test_clf, y_pred)\n",
"    # zero_division=0: a class that is never predicted contributes 0 instead of a warning.\n",
"    f1 = f1_score(y_test_clf, y_pred, average=F1_AVERAGE, zero_division=0)\n",
"\n",
"    # Confusion matrix\n",
"    c_matrix = confusion_matrix(y_test_clf, y_pred)\n",
"\n",
"    # Store the results\n",
"    results_classification[name] = {\n",
"        \"Best Params\": grid_search.best_params_,\n",
"        \"Accuracy\": acc,\n",
"        \"F1 Score\": f1,\n",
"        \"Confusion_matrix\": c_matrix\n",
"    }\n",
"\n",
"# Print the results\n",
"for name, metrics in results_classification.items():\n",
"    print(f\"\\nModel: {name}\")\n",
"    for metric, value in metrics.items():\n",
"        print(f\"{metric}: {value}\")"
]
}
],
"metadata": {