This commit is contained in:
Serxiolog 2024-10-07 21:07:16 +04:00
parent eb931a6879
commit 948b3b579c

View File

@ -10,7 +10,7 @@
},
{
"cell_type": "code",
"execution_count": 55,
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
@ -397,7 +397,7 @@
},
{
"cell_type": "code",
"execution_count": 63,
"execution_count": 69,
"metadata": {},
"outputs": [
{
@ -415,22 +415,44 @@
"-1 10\n",
" 6 7\n",
" 7 4\n",
"Name: count, dtype: int64\n",
"Обучающая выборка после oversampling: (25218, 4)\n",
"Volume_Grouped\n",
" 0 2802\n",
" 4 2802\n",
" 1 2802\n",
" 2 2802\n",
" 3 2802\n",
" 5 2802\n",
"-1 2802\n",
" 7 2802\n",
" 6 2802\n",
"Name: count, dtype: int64\n",
"Контрольная выборка: (1607, 4)\n",
"Volume_Grouped\n",
" 0 934\n",
" 1 487\n",
" 2 123\n",
" 3 37\n",
" 4 13\n",
" 5 6\n",
"-1 4\n",
" 6 2\n",
" 7 1\n",
"Name: count, dtype: int64\n",
"Тестовая выборка: (1608, 4)\n",
"Volume_Grouped\n",
" 0 934\n",
" 1 487\n",
" 2 124\n",
" 3 37\n",
" 4 14\n",
" 5 6\n",
"-1 3\n",
" 6 2\n",
" 7 1\n",
"Name: count, dtype: int64\n"
]
},
{
"ename": "RuntimeError",
"evalue": "Not any neigbours belong to the majority class. This case will induce a NaN case with a division by zero. ADASYN is not suited for this specific dataset. Use SMOTE instead.",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[63], line 18\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_coffee_train\u001b[38;5;241m.\u001b[39mshape)\n\u001b[0;32m 16\u001b[0m \u001b[38;5;28mprint\u001b[39m(df_coffee_train[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVolume_Grouped\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mvalue_counts())\n\u001b[1;32m---> 18\u001b[0m X_resampled, y_resampled \u001b[38;5;241m=\u001b[39m \u001b[43mada\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_resample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_coffee_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdf_coffee_train\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mVolume_Grouped\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 19\u001b[0m df_coffee_train_adasyn \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(X_resampled)\n\u001b[0;32m 21\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка после oversampling: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_coffee_train_adasyn\u001b[38;5;241m.\u001b[39mshape)\n",
"File \u001b[1;32mc:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\imblearn\\base.py:208\u001b[0m, in \u001b[0;36mBaseSampler.fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 187\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Resample the dataset.\u001b[39;00m\n\u001b[0;32m 188\u001b[0m \n\u001b[0;32m 189\u001b[0m \u001b[38;5;124;03mParameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 205\u001b[0m \u001b[38;5;124;03m The corresponding label of `X_resampled`.\u001b[39;00m\n\u001b[0;32m 206\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[1;32m--> 208\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_resample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\imblearn\\base.py:112\u001b[0m, in \u001b[0;36mSamplerMixin.fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 106\u001b[0m X, y, binarize_y \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_X_y(X, y)\n\u001b[0;32m 108\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy_ \u001b[38;5;241m=\u001b[39m check_sampling_strategy(\n\u001b[0;32m 109\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy, y, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sampling_type\n\u001b[0;32m 110\u001b[0m )\n\u001b[1;32m--> 112\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit_resample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 114\u001b[0m y_ \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 115\u001b[0m label_binarize(output[\u001b[38;5;241m1\u001b[39m], classes\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39munique(y)) \u001b[38;5;28;01mif\u001b[39;00m binarize_y \u001b[38;5;28;01melse\u001b[39;00m output[\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m 116\u001b[0m )\n\u001b[0;32m 118\u001b[0m X_, y_ \u001b[38;5;241m=\u001b[39m arrays_transformer\u001b[38;5;241m.\u001b[39mtransform(output[\u001b[38;5;241m0\u001b[39m], y_)\n",
"File \u001b[1;32mc:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\imblearn\\over_sampling\\_adasyn.py:183\u001b[0m, in \u001b[0;36mADASYN._fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 181\u001b[0m ratio_nn \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39msum(y[nns] \u001b[38;5;241m!=\u001b[39m class_sample, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m) \u001b[38;5;241m/\u001b[39m n_neighbors\n\u001b[0;32m 182\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m np\u001b[38;5;241m.\u001b[39msum(ratio_nn):\n\u001b[1;32m--> 183\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[0;32m 184\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNot any neigbours belong to the majority\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 185\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m class. This case will induce a NaN case\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 186\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m with a division by zero. ADASYN is not\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 187\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m suited for this specific dataset.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 188\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m Use SMOTE instead.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 189\u001b[0m )\n\u001b[0;32m 190\u001b[0m ratio_nn \u001b[38;5;241m/\u001b[39m\u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39msum(ratio_nn)\n\u001b[0;32m 191\u001b[0m n_samples_generate \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mrint(ratio_nn \u001b[38;5;241m*\u001b[39m n_samples)\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mint\u001b[39m)\n",
"\u001b[1;31mRuntimeError\u001b[0m: Not any neigbours belong to the majority class. This case will induce a NaN case with a division by zero. ADASYN is not suited for this specific dataset. Use SMOTE instead."
]
}
],
"source": [
@ -451,11 +473,11 @@
"print(\"Обучающая выборка: \", df_coffee_train.shape)\n",
"print(df_coffee_train[\"Volume_Grouped\"].value_counts())\n",
"\n",
"X_resampled, y_resampled = ada.fit_resample(df_coffee_train, df_coffee_train[\"Volume_Grouped\"])\n",
"X_resampled, y_resampled = apply_oversampling(df_coffee_train, df_coffee_train[\"Volume_Grouped\"])\n",
"df_coffee_train_adasyn = pd.DataFrame(X_resampled)\n",
"\n",
"print(\"Обучающая выборка после oversampling: \", df_coffee_train_adasyn.shape)\n",
"print(df_coffee_train_adasyn.Pclass.value_counts())\n",
"print(df_coffee_train_adasyn[\"Volume_Grouped\"].value_counts())\n",
"\n",
"print(\"Контрольная выборка: \", df_coffee_val.shape)\n",
"print(df_coffee_val[\"Volume_Grouped\"].value_counts())\n",
@ -473,7 +495,7 @@
},
{
"cell_type": "code",
"execution_count": 46,
"execution_count": 68,
"metadata": {},
"outputs": [
{
@ -489,6 +511,15 @@
" 0 20\n",
"-1 5\n",
"Name: count, dtype: int64\n",
"Обучающая выборка после oversampling: (1104, 4)\n",
"Sales_Grouped\n",
" 3 184\n",
" 1 184\n",
" 2 184\n",
" 0 184\n",
"-1 184\n",
" 4 184\n",
"Name: count, dtype: int64\n",
"Контрольная выборка: (179, 4)\n",
"Sales_Grouped\n",
" 2 61\n",
@ -528,11 +559,11 @@
"print(\"Обучающая выборка: \", df_shop_train.shape)\n",
"print(df_shop_train[\"Sales_Grouped\"].value_counts())\n",
"\n",
"X_resampled, y_resampled = ada.fit_resample(df_mark_train, df_mark_train[\"score_grouped\"])\n",
"df_mark_train_adasyn = pd.DataFrame(X_resampled)\n",
"X_resampled, y_resampled = apply_oversampling(df_shop_train, df_shop_train[\"Sales_Grouped\"])\n",
"df_shop_train_adasyn = pd.DataFrame(X_resampled)\n",
"\n",
"print(\"Обучающая выборка после oversampling: \", df_mark_train_adasyn.shape)\n",
"print(df_mark_train_adasyn.Pclass.value_counts())\n",
"print(\"Обучающая выборка после oversampling: \", df_shop_train_adasyn.shape)\n",
"print(df_shop_train_adasyn[\"Sales_Grouped\"].value_counts())\n",
"\n",
"print(\"Контрольная выборка: \", df_shop_val.shape)\n",
"print(df_shop_val[\"Sales_Grouped\"].value_counts())\n",
@ -550,7 +581,7 @@
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 67,
"metadata": {},
"outputs": [
{
@ -564,23 +595,32 @@
" 4 101\n",
" 1 31\n",
"-1 4\n",
"Name: count, dtype: int64\n",
"Обучающая выборка после oversampling: (1415, 4)\n",
"score_grouped\n",
" 2 283\n",
" 4 283\n",
" 3 283\n",
" 1 283\n",
"-1 283\n",
"Name: count, dtype: int64\n",
"Контрольная выборка: (200, 4)\n",
"score_grouped\n",
" 3 95\n",
" 2 61\n",
" 4 33\n",
" 1 10\n",
"-1 1\n",
"Name: count, dtype: int64\n",
"Тестовая выборка: (200, 4)\n",
"score_grouped\n",
" 3 94\n",
" 2 60\n",
" 4 34\n",
" 1 11\n",
"-1 1\n",
"Name: count, dtype: int64\n"
]
},
{
"ename": "ValueError",
"evalue": "Expected n_neighbors <= n_samples_fit, but n_neighbors = 6, n_samples_fit = 4, n_samples = 4",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[58], line 23\u001b[0m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_mark_train\u001b[38;5;241m.\u001b[39mshape)\n\u001b[0;32m 21\u001b[0m \u001b[38;5;28mprint\u001b[39m(df_mark_train[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mscore_grouped\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mvalue_counts())\n\u001b[1;32m---> 23\u001b[0m X_resampled, y_resampled \u001b[38;5;241m=\u001b[39m \u001b[43mada\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_resample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_mark_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdf_mark_train\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mscore_grouped\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 24\u001b[0m df_mark_train_adasyn \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(X_resampled)\n\u001b[0;32m 26\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка после oversampling: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_mark_train_adasyn\u001b[38;5;241m.\u001b[39mshape)\n",
"File \u001b[1;32mc:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\imblearn\\base.py:208\u001b[0m, in \u001b[0;36mBaseSampler.fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 187\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Resample the dataset.\u001b[39;00m\n\u001b[0;32m 188\u001b[0m \n\u001b[0;32m 189\u001b[0m \u001b[38;5;124;03mParameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 205\u001b[0m \u001b[38;5;124;03m The corresponding label of `X_resampled`.\u001b[39;00m\n\u001b[0;32m 206\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[1;32m--> 208\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_resample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\imblearn\\base.py:112\u001b[0m, in \u001b[0;36mSamplerMixin.fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 106\u001b[0m X, y, binarize_y \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_X_y(X, y)\n\u001b[0;32m 108\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy_ \u001b[38;5;241m=\u001b[39m check_sampling_strategy(\n\u001b[0;32m 109\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy, y, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sampling_type\n\u001b[0;32m 110\u001b[0m )\n\u001b[1;32m--> 112\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit_resample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 114\u001b[0m y_ \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 115\u001b[0m label_binarize(output[\u001b[38;5;241m1\u001b[39m], classes\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39munique(y)) \u001b[38;5;28;01mif\u001b[39;00m binarize_y \u001b[38;5;28;01melse\u001b[39;00m output[\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m 116\u001b[0m )\n\u001b[0;32m 118\u001b[0m X_, y_ \u001b[38;5;241m=\u001b[39m arrays_transformer\u001b[38;5;241m.\u001b[39mtransform(output[\u001b[38;5;241m0\u001b[39m], y_)\n",
"File \u001b[1;32mc:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\imblearn\\over_sampling\\_adasyn.py:202\u001b[0m, in \u001b[0;36mADASYN._fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 199\u001b[0m \u001b[38;5;66;03m# the nearest neighbors need to be fitted only on the current class\u001b[39;00m\n\u001b[0;32m 200\u001b[0m \u001b[38;5;66;03m# to find the class NN to generate new samples\u001b[39;00m\n\u001b[0;32m 201\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnn_\u001b[38;5;241m.\u001b[39mfit(X_class)\n\u001b[1;32m--> 202\u001b[0m nns \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnn_\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkneighbors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_class\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_distance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m[:, \u001b[38;5;241m1\u001b[39m:]\n\u001b[0;32m 204\u001b[0m enumerated_class_indices \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39marange(\u001b[38;5;28mlen\u001b[39m(target_class_indices))\n\u001b[0;32m 205\u001b[0m rows \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mrepeat(enumerated_class_indices, n_samples_generate)\n",
"File \u001b[1;32mc:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\sklearn\\neighbors\\_base.py:834\u001b[0m, in \u001b[0;36mKNeighborsMixin.kneighbors\u001b[1;34m(self, X, n_neighbors, return_distance)\u001b[0m\n\u001b[0;32m 832\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 833\u001b[0m inequality_str \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mn_neighbors <= n_samples_fit\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m--> 834\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 835\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExpected \u001b[39m\u001b[38;5;132;01m{\u001b[39;00minequality_str\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, but \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 836\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mn_neighbors = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mn_neighbors\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, n_samples_fit = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mn_samples_fit\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 837\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mn_samples = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mX\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;66;03m# include n_samples for common tests\u001b[39;00m\n\u001b[0;32m 838\u001b[0m )\n\u001b[0;32m 840\u001b[0m n_jobs \u001b[38;5;241m=\u001b[39m effective_n_jobs(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_jobs)\n\u001b[0;32m 841\u001b[0m chunked_results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
"\u001b[1;31mValueError\u001b[0m: Expected n_neighbors <= n_samples_fit, but n_neighbors = 6, n_samples_fit = 4, n_samples = 4"
]
}
],
"source": [
@ -606,11 +646,11 @@
"print(\"Обучающая выборка: \", df_mark_train.shape)\n",
"print(df_mark_train[\"score_grouped\"].value_counts())\n",
"\n",
"X_resampled, y_resampled = ada.fit_resample(df_mark_train, df_mark_train[\"score_grouped\"])\n",
"X_resampled, y_resampled = apply_oversampling(df_mark_train, df_mark_train[\"score_grouped\"])\n",
"df_mark_train_adasyn = pd.DataFrame(X_resampled)\n",
"\n",
"print(\"Обучающая выборка после oversampling: \", df_mark_train_adasyn.shape)\n",
"print(df_mark_train_adasyn.Pclass.value_counts())\n",
"print(df_mark_train_adasyn[\"score_grouped\"].value_counts())\n",
"\n",
"print(\"Контрольная выборка: \", df_mark_val.shape)\n",
"print(df_mark_val[\"score_grouped\"].value_counts())\n",