diff --git a/lab_4/Lab4.ipynb b/lab_4/Lab4.ipynb index 814ef70..c43967a 100644 --- a/lab_4/Lab4.ipynb +++ b/lab_4/Lab4.ipynb @@ -337,9 +337,16 @@ "print(df.isnull().any())" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Классификация" + ] + }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1248,45 +1255,104 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Теперь перейдем к делению на выборки и созданию ориентира" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Размер обучающей выборки: (3004, 10)\n", + "Размер тестовой выборки: (751, 10)\n", + "Baseline Accuracy: 0.5126498002663116\n", + "Baseline F1 Score: 0.3474826991241725\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score, f1_score\n", + "\n", + "# Загрузка данных\n", + "df = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n", + "\n", + "# Создание целевого признака\n", + "median_salary = df['salary_in_usd'].median()\n", + "df['above_median_salary'] = np.where(df['salary_in_usd'] > median_salary, 1, 0)\n", + "\n", + "# Разделение на признаки и целевую переменную\n", + "features = ['work_year', 'experience_level', 'employment_type', 'job_title', 'salary', 'salary_currency', 'remote_ratio', 'employee_residence', 'company_location', 'company_size']\n", + "target = 'above_median_salary'\n", + "\n", + "# Разделение данных на тренировочный и тестовый наборы\n", + "X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42, stratify=df[target])\n", + "\n", + "print(\"Размер обучающей выборки:\", X_train.shape)\n", + "print(\"Размер тестовой выборки:\", X_test.shape)\n", + "\n", + "# Создание ориентира (baseline)\n", + 
"baseline_threshold = y_train.mean()\n", + "baseline_predictions = [1 if pred > baseline_threshold else 0 for pred in [baseline_threshold] * len(y_test)]\n", + "\n", + "# Вычисление метрик для ориентира\n", + "baseline_accuracy = accuracy_score(y_test, baseline_predictions)\n", + "baseline_f1 = f1_score(y_test, baseline_predictions, average='weighted')\n", + "\n", + "print('Baseline Accuracy:', baseline_accuracy)\n", + "print('Baseline F1 Score:', baseline_f1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Создание конвейера и обучение моделей" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { - "ename": "IndexError", - "evalue": "Index dimension must be 1 or 2", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[14], line 71\u001b[0m\n\u001b[0;32m 62\u001b[0m pipeline_end \u001b[38;5;241m=\u001b[39m Pipeline(\n\u001b[0;32m 63\u001b[0m [\n\u001b[0;32m 64\u001b[0m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfeatures_preprocessing\u001b[39m\u001b[38;5;124m\"\u001b[39m, features_preprocessing),\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 67\u001b[0m ]\n\u001b[0;32m 68\u001b[0m )\n\u001b[0;32m 70\u001b[0m \u001b[38;5;66;03m# Демонстрация работы конвейера для предобработки данных при классификации\u001b[39;00m\n\u001b[1;32m---> 71\u001b[0m preprocessing_result \u001b[38;5;241m=\u001b[39m \u001b[43mpipeline_end\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 72\u001b[0m preprocessed_df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(\n\u001b[0;32m 73\u001b[0m preprocessing_result,\n\u001b[0;32m 74\u001b[0m 
columns\u001b[38;5;241m=\u001b[39mpipeline_end\u001b[38;5;241m.\u001b[39mget_feature_names_out(),\n\u001b[0;32m 75\u001b[0m )\n\u001b[0;32m 77\u001b[0m preprocessed_df\n", - "File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context..decorator..wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\pipeline.py:533\u001b[0m, in \u001b[0;36mPipeline.fit_transform\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 490\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Fit the model and transform with the final estimator.\u001b[39;00m\n\u001b[0;32m 491\u001b[0m \n\u001b[0;32m 492\u001b[0m \u001b[38;5;124;03mFit all the transformers one after the other and sequentially transform\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 530\u001b[0m \u001b[38;5;124;03m Transformed samples.\u001b[39;00m\n\u001b[0;32m 531\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 532\u001b[0m routed_params \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_method_params(method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m, props\u001b[38;5;241m=\u001b[39mparams)\n\u001b[1;32m--> 533\u001b[0m Xt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrouted_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 535\u001b[0m last_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_final_estimator\n\u001b[0;32m 536\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_log_message(\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m)):\n", - "File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\pipeline.py:406\u001b[0m, in \u001b[0;36mPipeline._fit\u001b[1;34m(self, X, y, routed_params)\u001b[0m\n\u001b[0;32m 404\u001b[0m cloned_transformer \u001b[38;5;241m=\u001b[39m clone(transformer)\n\u001b[0;32m 405\u001b[0m \u001b[38;5;66;03m# Fit or load from cache the current transformer\u001b[39;00m\n\u001b[1;32m--> 406\u001b[0m X, fitted_transformer \u001b[38;5;241m=\u001b[39m \u001b[43mfit_transform_one_cached\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 407\u001b[0m \u001b[43m \u001b[49m\u001b[43mcloned_transformer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 408\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 409\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 410\u001b[0m 
\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 411\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage_clsname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPipeline\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 412\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_message\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstep_idx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 413\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrouted_params\u001b[49m\u001b[43m[\u001b[49m\u001b[43mname\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 414\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 415\u001b[0m \u001b[38;5;66;03m# Replace the transformer of the step with the fitted\u001b[39;00m\n\u001b[0;32m 416\u001b[0m \u001b[38;5;66;03m# transformer. 
This is necessary when loading the transformer\u001b[39;00m\n\u001b[0;32m 417\u001b[0m \u001b[38;5;66;03m# from the cache.\u001b[39;00m\n\u001b[0;32m 418\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps[step_idx] \u001b[38;5;241m=\u001b[39m (name, fitted_transformer)\n", - "File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\joblib\\memory.py:312\u001b[0m, in \u001b[0;36mNotMemorizedFunc.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 311\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 312\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\pipeline.py:1310\u001b[0m, in \u001b[0;36m_fit_transform_one\u001b[1;34m(transformer, X, y, weight, message_clsname, message, params)\u001b[0m\n\u001b[0;32m 1308\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(message_clsname, message):\n\u001b[0;32m 1309\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(transformer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m-> 1310\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mtransformer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfit_transform\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1311\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1312\u001b[0m res \u001b[38;5;241m=\u001b[39m transformer\u001b[38;5;241m.\u001b[39mfit(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit\u001b[39m\u001b[38;5;124m\"\u001b[39m, {}))\u001b[38;5;241m.\u001b[39mtransform(\n\u001b[0;32m 1313\u001b[0m X, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtransform\u001b[39m\u001b[38;5;124m\"\u001b[39m, {})\n\u001b[0;32m 1314\u001b[0m )\n", - "File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\utils\\_set_output.py:316\u001b[0m, in \u001b[0;36m_wrap_method_output..wrapped\u001b[1;34m(self, X, *args, **kwargs)\u001b[0m\n\u001b[0;32m 314\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(f)\n\u001b[0;32m 315\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapped\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 316\u001b[0m data_to_wrap \u001b[38;5;241m=\u001b[39m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_to_wrap, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[0;32m 318\u001b[0m \u001b[38;5;66;03m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[0;32m 319\u001b[0m return_tuple \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 320\u001b[0m _wrap_data_with_container(method, data_to_wrap[\u001b[38;5;241m0\u001b[39m], X, \u001b[38;5;28mself\u001b[39m),\n\u001b[0;32m 321\u001b[0m \u001b[38;5;241m*\u001b[39mdata_to_wrap[\u001b[38;5;241m1\u001b[39m:],\n\u001b[0;32m 322\u001b[0m )\n", - "File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\base.py:1098\u001b[0m, in \u001b[0;36mTransformerMixin.fit_transform\u001b[1;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[0;32m 1083\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m 1084\u001b[0m (\n\u001b[0;32m 1085\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThis object (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m) has a `transform`\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1093\u001b[0m \u001b[38;5;167;01mUserWarning\u001b[39;00m,\n\u001b[0;32m 1094\u001b[0m )\n\u001b[0;32m 1096\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m y \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 1097\u001b[0m \u001b[38;5;66;03m# fit method of arity 1 (unsupervised transformation)\u001b[39;00m\n\u001b[1;32m-> 1098\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mfit_params\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtransform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1099\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1100\u001b[0m \u001b[38;5;66;03m# fit method of arity 2 (supervised transformation)\u001b[39;00m\n\u001b[0;32m 1101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfit(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfit_params)\u001b[38;5;241m.\u001b[39mtransform(X)\n", - "File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\utils\\_set_output.py:316\u001b[0m, in \u001b[0;36m_wrap_method_output..wrapped\u001b[1;34m(self, X, *args, **kwargs)\u001b[0m\n\u001b[0;32m 314\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(f)\n\u001b[0;32m 315\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapped\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 316\u001b[0m data_to_wrap \u001b[38;5;241m=\u001b[39m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_to_wrap, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[0;32m 318\u001b[0m 
\u001b[38;5;66;03m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[0;32m 319\u001b[0m return_tuple \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 320\u001b[0m _wrap_data_with_container(method, data_to_wrap[\u001b[38;5;241m0\u001b[39m], X, \u001b[38;5;28mself\u001b[39m),\n\u001b[0;32m 321\u001b[0m \u001b[38;5;241m*\u001b[39mdata_to_wrap[\u001b[38;5;241m1\u001b[39m:],\n\u001b[0;32m 322\u001b[0m )\n", - "Cell \u001b[1;32mIn[14], line 18\u001b[0m, in \u001b[0;36mSalaryFeatures.transform\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mtransform\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, y\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[0;32m 16\u001b[0m \u001b[38;5;66;03m# Создание новых признаков\u001b[39;00m\n\u001b[0;32m 17\u001b[0m X \u001b[38;5;241m=\u001b[39m X\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[1;32m---> 18\u001b[0m X[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwork_year_to_remote_ratio\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mX\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mwork_year\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;241m/\u001b[39m X[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mremote_ratio\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 19\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m X\n", - "File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\scipy\\sparse\\_csr.py:24\u001b[0m, in \u001b[0;36m_csr_base.__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__getitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, key):\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mndim \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m2\u001b[39m:\n\u001b[1;32m---> 24\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__getitem__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 26\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, \u001b[38;5;28mtuple\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(key) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[0;32m 27\u001b[0m key \u001b[38;5;241m=\u001b[39m key[\u001b[38;5;241m0\u001b[39m]\n", - "File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\scipy\\sparse\\_index.py:52\u001b[0m, in \u001b[0;36mIndexMixin.__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 51\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__getitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, key):\n\u001b[1;32m---> 52\u001b[0m row, col \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_indices\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 54\u001b[0m \u001b[38;5;66;03m# Dispatch to specialized methods.\u001b[39;00m\n\u001b[0;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(row, INT_TYPES):\n", - "File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\scipy\\sparse\\_index.py:186\u001b[0m, in \u001b[0;36mIndexMixin._validate_indices\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 184\u001b[0m row \u001b[38;5;241m=\u001b[39m _validate_bool_idx(bool_row, M, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrow\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 185\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(row, \u001b[38;5;28mslice\u001b[39m):\n\u001b[1;32m--> 186\u001b[0m row 
\u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_asindices\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mM\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 188\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m isintlike(col):\n\u001b[0;32m 189\u001b[0m col \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mint\u001b[39m(col)\n", - "File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\scipy\\sparse\\_index.py:212\u001b[0m, in \u001b[0;36mIndexMixin._asindices\u001b[1;34m(self, idx, length)\u001b[0m\n\u001b[0;32m 209\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIndexError\u001b[39;00m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124minvalid index\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[0;32m 211\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m x\u001b[38;5;241m.\u001b[39mndim \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m (\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m):\n\u001b[1;32m--> 212\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIndexError\u001b[39;00m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mIndex dimension must be 1 or 2\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m x\u001b[38;5;241m.\u001b[39msize \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m 215\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m x\n", - "\u001b[1;31mIndexError\u001b[0m: Index dimension must be 1 or 2" + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: Logistic Regression\n", + "Accuracy: 0.7523\n", + "F1 Score: 0.7609\n", + "----------------------------------------\n", + "Model: Decision Tree\n", + "Accuracy: 0.9960\n", + "F1 Score: 0.9959\n", + "----------------------------------------\n", + "Model: Gradient Boosting\n", + "Accuracy: 
0.9947\n", + "F1 Score: 0.9945\n", + "----------------------------------------\n" ] } ], "source": [ - "import numpy as np\n", "import pandas as pd\n", - "from sklearn.base import BaseEstimator, TransformerMixin\n", - "from sklearn.compose import ColumnTransformer\n", - "from sklearn.impute import SimpleImputer\n", - "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", - "from sklearn.pipeline import Pipeline\n", "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.ensemble import GradientBoostingClassifier\n", + "from sklearn.metrics import accuracy_score, f1_score\n", "\n", "# Загрузка данных\n", "df = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n", @@ -1302,168 +1368,350 @@ "# Разделение данных на тренировочный и тестовый наборы\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n", "\n", - "# Построение конвейеров предобработки\n", - "\n", - "class SalaryFeatures(BaseEstimator, TransformerMixin):\n", - " def __init__(self):\n", - " pass\n", - " def fit(self, X, y=None):\n", - " return self\n", - " def transform(self, X, y=None):\n", - " # Создание новых признаков\n", - " X = X.copy()\n", - " X[\"work_year_to_remote_ratio\"] = X[\"work_year\"] / X[\"remote_ratio\"]\n", - " return X\n", - " def get_feature_names_out(self, features_in):\n", - " # Добавление имен новых признаков\n", - " new_features = [\"work_year_to_remote_ratio\"]\n", - " return np.append(features_in, new_features, axis=0)\n", - "\n", - "# Обработка числовых данных. 
Числовой конвейер: заполнение пропущенных значений медианой и стандартизация\n", - "preprocessing_num_class = Pipeline(steps=[\n", - " ('imputer', SimpleImputer(strategy='median')),\n", - " ('scaler', StandardScaler())\n", - "])\n", - "\n", - "# Обработка категориальных данных: заполнение пропущенных значений наиболее частым значением и one-hot encoding\n", - "preprocessing_cat_class = Pipeline(steps=[\n", - " ('imputer', SimpleImputer(strategy='most_frequent')),\n", - " ('onehot', OneHotEncoder(handle_unknown='ignore'))\n", - "])\n", - "\n", "# Определение столбцов\n", - "numeric_columns = [\"work_year\", \"salary\", \"salary_in_usd\", \"remote_ratio\"]\n", + "numeric_columns = [\"work_year\", \"salary\", \"remote_ratio\"]\n", "cat_columns = [\"experience_level\", \"employment_type\", \"job_title\", \"salary_currency\", \"employee_residence\", \"company_location\", \"company_size\"]\n", "\n", - "# Предобработка признаков\n", - "features_preprocessing = ColumnTransformer(\n", - " verbose_feature_names_out=False,\n", + "# Предобработка данных\n", + "preprocessor = ColumnTransformer(\n", " transformers=[\n", - " (\"prepocessing_num\", preprocessing_num_class, numeric_columns),\n", - " (\"prepocessing_cat\", preprocessing_cat_class, cat_columns),\n", - " ],\n", - " remainder=\"passthrough\"\n", - ")\n", + " ('num', StandardScaler(), numeric_columns),\n", + " ('cat', OneHotEncoder(handle_unknown='ignore'), cat_columns)])\n", "\n", - "# Удаление колонок\n", - "columns_to_drop = [] # Укажите столбцы, которые нужно удалить, если они есть\n", - "drop_columns = ColumnTransformer(\n", - " verbose_feature_names_out=False,\n", - " transformers=[\n", - " (\"drop_columns\", \"drop\", columns_to_drop),\n", - " ],\n", - " remainder=\"passthrough\",\n", - ")\n", + "# Создание конвейеров для моделей\n", + "pipeline_logistic_regression = Pipeline(steps=[\n", + " ('preprocessor', preprocessor),\n", + " ('classifier', LogisticRegression(random_state=42))])\n", "\n", - "# Основной 
конвейер предобработки данных и конструирования признаков\n", - "pipeline_end = Pipeline(\n", - " [\n", - " (\"features_preprocessing\", features_preprocessing),\n", - " (\"custom_features\", SalaryFeatures()),\n", - " (\"drop_columns\", drop_columns),\n", - " ]\n", - ")\n", + "pipeline_decision_tree = Pipeline(steps=[\n", + " ('preprocessor', preprocessor),\n", + " ('classifier', DecisionTreeClassifier(random_state=42))])\n", "\n", - "# Демонстрация работы конвейера для предобработки данных при классификации\n", - "preprocessing_result = pipeline_end.fit_transform(X_train)\n", + "pipeline_gradient_boosting = Pipeline(steps=[\n", + " ('preprocessor', preprocessor),\n", + " ('classifier', GradientBoostingClassifier(random_state=42))])\n", "\n", - "# Получение имен столбцов после преобразования\n", - "feature_names = pipeline_end.named_steps['features_preprocessing'].get_feature_names_out(numeric_columns + cat_columns)\n", - "feature_names = np.append(feature_names, [\"work_year_to_remote_ratio\"])\n", + "# Список конвейеров \n", + "pipelines = [\n", + " ('Logistic Regression', pipeline_logistic_regression),\n", + " ('Decision Tree', pipeline_decision_tree),\n", + " ('Gradient Boosting', pipeline_gradient_boosting)\n", + "]\n", "\n", - "# Создание DataFrame с преобразованными данными\n", - "preprocessed_df = pd.DataFrame(\n", - " preprocessing_result,\n", - " columns=feature_names,\n", - ")\n", - "\n", - "preprocessed_df" + "# Обучение моделей и вывод результатов\n", + "for name, pipeline in pipelines:\n", + " pipeline.fit(X_train, y_train)\n", + " y_pred = pipeline.predict(X_test)\n", + " accuracy = accuracy_score(y_test, y_pred)\n", + " f1 = f1_score(y_test, y_pred)\n", + " print(f\"Model: {name}\")\n", + " print(f\"Accuracy: {accuracy:.4f}\")\n", + " print(f\"F1 Score: {f1:.4f}\")\n", + " print(\"-\" * 40)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Бизнес-цели**\n", - "\n", - "1. 
Предсказание заработной платы (Регрессия)\n", - "\n", - " Цель: Предсказать заработную плату (salary_in_usd) на основе других характеристик, таких как уровень опыта (experience_level), тип занятости (employment_type), должность (job_title), место проживания сотрудника (employee_residence), размер компании (company_size) и другие факторы.\n", - "\n", - " Применение: Это может быть полезно для HR-отделов, которые хотят оценить справедливую зарплату для новых сотрудников или для анализа рынка труда.\n", - "\n", - "2. Классификация уровня опыта по зарплате (Классификация)\n", - "\n", - " Цель: Классифицировать уровень опыта (experience_level) на основе заработной платы (salary_in_usd) и других факторов.\n", - "\n", - " Применение: Это может помочь в оценке, на каком уровне опыта находится сотрудник, основываясь на его зарплате, что может быть полезно для оценки карьерного роста." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "1. Прогнозирование зарплаты" + "Оценка качества моделей" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " work_year experience_level employment_type job_title \\\n", - "0 2023 SE FT Principal Data Scientist \n", - "1 2023 MI CT ML Engineer \n", - "2 2023 MI CT ML Engineer \n", - "3 2023 SE FT Data Scientist \n", - "4 2023 SE FT Data Scientist \n", + "Model: Logistic Regression\n", + "Accuracy: 0.7523302263648469\n", + "F1 Score: 0.7517841210039291\n", "\n", - " salary salary_currency salary_in_usd employee_residence remote_ratio \\\n", - "0 80000 EUR 85847 ES 100 \n", - "1 30000 USD 30000 US 100 \n", - "2 25500 USD 25500 US 100 \n", - "3 175000 USD 175000 CA 100 \n", - "4 120000 USD 120000 CA 100 \n", + "Model: Decision Tree\n", + "Accuracy: 0.996005326231691\n", + "F1 Score: 0.9960048583691977\n", "\n", - " company_location company_size \n", - "0 ES L \n", - "1 US S \n", - "2 US S \n", - "3 CA M \n", - 
"4 CA M \n", - "\n", - "RangeIndex: 3755 entries, 0 to 3754\n", - "Data columns (total 11 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 work_year 3755 non-null int64 \n", - " 1 experience_level 3755 non-null object\n", - " 2 employment_type 3755 non-null object\n", - " 3 job_title 3755 non-null object\n", - " 4 salary 3755 non-null int64 \n", - " 5 salary_currency 3755 non-null object\n", - " 6 salary_in_usd 3755 non-null int64 \n", - " 7 employee_residence 3755 non-null object\n", - " 8 remote_ratio 3755 non-null int64 \n", - " 9 company_location 3755 non-null object\n", - " 10 company_size 3755 non-null object\n", - "dtypes: int64(4), object(7)\n", - "memory usage: 322.8+ KB\n", - "None\n", - " work_year salary salary_in_usd remote_ratio\n", - "count 3755.000000 3.755000e+03 3755.000000 3755.000000\n", - "mean 2022.373635 1.906956e+05 137570.389880 46.271638\n", - "std 0.691448 6.716765e+05 63055.625278 48.589050\n", - "min 2020.000000 6.000000e+03 5132.000000 0.000000\n", - "25% 2022.000000 1.000000e+05 95000.000000 0.000000\n", - "50% 2022.000000 1.380000e+05 135000.000000 0.000000\n", - "75% 2023.000000 1.800000e+05 175000.000000 100.000000\n", - "max 2023.000000 3.040000e+07 450000.000000 100.000000\n", - "work_year 0\n", + "Model: Gradient Boosting\n", + "Accuracy: 0.9946737683089214\n", + "F1 Score: 0.9946728986768623\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.metrics import accuracy_score, f1_score\n", + "\n", + "for name, pipeline in pipelines:\n", + " y_pred = pipeline.predict(X_test)\n", + " print(f\"Model: {name}\")\n", + " print('Accuracy:', accuracy_score(y_test, y_pred))\n", + " print('F1 Score:', f1_score(y_test, y_pred, average='weighted'))\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Регрессия\n", + "Цель: Разработать модель регрессии, которая будет предсказывать зарплату (salary_in_usd) на основе демографических данных, типа работы и 
# %%
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import uniform, randint
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor


def drop_zscore_outliers(frame, columns, threshold=3):
    """Return the rows of `frame` that are not z-score outliers.

    A row is kept only when the absolute z-score of every column listed in
    `columns` is below `threshold`. The test is two-sided: the earlier
    `z_scores < threshold` form only rejected extreme high values and let
    extreme low values through.
    """
    z_scores = np.abs(stats.zscore(frame[columns]))
    return frame[(z_scores < threshold).all(axis=1)]


def report_regression_metrics(y_true, y_pred, prefix=''):
    """Print MAE, MSE and R² for a set of predictions, each label prefixed with `prefix`."""
    print(f'{prefix}MAE:', mean_absolute_error(y_true, y_pred))
    print(f'{prefix}MSE:', mean_squared_error(y_true, y_pred))
    print(f'{prefix}R²:', r2_score(y_true, y_pred))


def split_and_report_baseline(frame, feature_columns, target_column, test_size=0.2, random_state=42):
    """Split `frame` into train/test parts and print the mean-prediction baseline.

    Returns the tuple (X_train, X_test, y_train, y_test) produced by
    `train_test_split`.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        frame[feature_columns], frame[target_column],
        test_size=test_size, random_state=random_state,
    )
    print("Размер обучающей выборки:", X_tr.shape)
    print("Размер тестовой выборки:", X_te.shape)

    # Baseline: always predict the training-set mean of the target.
    baseline_predictions = [y_tr.mean()] * len(y_te)
    report_regression_metrics(y_te, baseline_predictions, prefix='Baseline ')
    return X_tr, X_te, y_tr, y_te


def evaluate_regressors(fitted_pipelines, X_eval, y_eval):
    """Print the regression metrics of every (name, fitted pipeline) pair."""
    for name, pipeline in fitted_pipelines:
        y_pred = pipeline.predict(X_eval)
        print(f"Model: {name}")
        report_regression_metrics(y_eval, y_pred)
        print()


# %%
# Outlier removal, first variant: the local-currency `salary` column takes part in the filter.
df = pd.read_csv("..//static//csv//ds_salaries.csv")

numeric_features = ['work_year', 'salary', 'salary_in_usd', 'remote_ratio']
threshold = 3

# NOTE(review): going by the describe() output recorded earlier in the notebook
# (work_year mean ~2022.37, std ~0.69), the two-sided rule also rejects the
# work_year == 2020 rows. Confirm whether work_year should stay in this list.
df_cleaned = drop_zscore_outliers(df, numeric_features, threshold)

print("Размер данных до удаления выбросов:", df.shape)
print("Размер данных после удаления выбросов:", df_cleaned.shape)

# %%
# Train/test split and baseline for the regression task.
features = ['work_year', 'experience_level', 'employment_type', 'job_title', 'salary_currency', 'remote_ratio', 'employee_residence', 'company_location', 'company_size']
target = 'salary_in_usd'

X_train, X_test, y_train, y_test = split_and_report_baseline(df_cleaned, features, target)

# %%
# Full regression run: reload, filter outliers (this time without the
# local-currency `salary` column), split, fit three models and evaluate them.
df = pd.read_csv("..//static//csv//ds_salaries.csv")

numeric_features = ['work_year', 'salary_in_usd', 'remote_ratio']
threshold = 3
df_cleaned = drop_zscore_outliers(df, numeric_features, threshold)

print("Размер данных до удаления выбросов:", df.shape)
print("Размер данных после удаления выбросов:", df_cleaned.shape)

features = ['work_year', 'experience_level', 'employment_type', 'job_title', 'salary_currency', 'remote_ratio', 'employee_residence', 'company_location', 'company_size']
target = 'salary_in_usd'

X_train, X_test, y_train, y_test = split_and_report_baseline(df_cleaned, features, target)

# Preprocessing: scale the numeric columns, one-hot encode the categorical ones.
categorical_features = ['experience_level', 'employment_type', 'job_title', 'salary_currency', 'employee_residence', 'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

pipeline_linear_regression = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())])

pipeline_decision_tree = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42))])

pipeline_gradient_boosting = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42))])

pipelines = [
    ('Linear Regression', pipeline_linear_regression),
    ('Decision Tree', pipeline_decision_tree),
    ('Gradient Boosting', pipeline_gradient_boosting)
]

for name, pipeline in pipelines:
    pipeline.fit(X_train, y_train)
    print(f"Model: {name} trained.")

# Model quality on the hold-out split.
evaluate_regressors(pipelines, X_test, y_test)

# %%
# Stand-alone re-evaluation of the fitted regression pipelines.
evaluate_regressors(pipelines, X_test, y_test)

# %%
# Data preparation for the hyperparameter search in the next cell.
df = pd.read_csv("..//static//csv//ds_salaries.csv")

# Check for missing values, then drop incomplete rows.
print("Пропущенные значения:\n", df.isnull().sum())
df = df.dropna()

features = ['work_year', 'experience_level', 'employment_type', 'job_title', 'employee_residence', 'remote_ratio', 'company_location', 'company_size']
target = 'salary_in_usd'

categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

y = df[target]

# Split BEFORE fitting the preprocessor. Fitting the scaler and encoder on the
# full dataset (as was done previously) leaks test-set statistics and category
# vocabulary into training. The row partition itself is unchanged, because it
# depends only on the number of rows and random_state.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(df[features], y, test_size=0.2, random_state=42)

X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)
\"d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\base.py\", line 1473, in wrapper\n", + " return fit_method(estimator, *args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\pipeline.py\", line 473, in fit\n", + " self._final_estimator.fit(Xt, y, **last_step_params[\"fit\"])\n", + " File \"d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\base.py\", line 1473, in wrapper\n", + " return fit_method(estimator, *args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\linear_model\\_base.py\", line 609, in fit\n", + " X, y = self._validate_data(\n", + " ^^^^^^^^^^^^^^^^^^^^\n", + " File \"d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\base.py\", line 650, in _validate_data\n", + " X, y = check_X_y(X, y, **check_params)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\utils\\validation.py\", line 1301, in check_X_y\n", + " X = check_array(\n", + " ^^^^^^^^^^^^\n", + " File \"d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\utils\\validation.py\", line 971, in check_array\n", + " array = _ensure_sparse_format(\n", + " ^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\utils\\validation.py\", line 595, in _ensure_sparse_format\n", + " raise TypeError(\n", + "TypeError: Sparse data was passed for X, but dense data is required. 
Use '.toarray()' to convert to a dense numpy array.\n", "\n", - " accuracy 0.72 751\n", - " macro avg 0.58 0.53 0.55 751\n", - "weighted avg 0.72 0.72 0.72 751\n", - "\n", - "Confusion Matrix:\n", - "[[ 32 0 20 15]\n", - " [ 0 6 5 12]\n", - " [ 14 0 84 59]\n", - " [ 12 7 65 420]]\n", - "Accuracy Score: 0.7217043941411452\n" + " warnings.warn(some_fits_failed_message, FitFailedWarning)\n", + "d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:1103: UserWarning: One or more of the test scores are non-finite: [ nan 0.37308723 nan 0.37316524]\n", + " warnings.warn(\n", + "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_14908\\2948510432.py:70: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.\n", + " axes[i].set_xticklabels(params.keys(), rotation=45, ha=\"right\") #Поворачиваем подписи на оси х\n", + "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_14908\\2948510432.py:70: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.\n", + " axes[i].set_xticklabels(params.keys(), rotation=45, ha=\"right\") #Поворачиваем подписи на оси х\n", + "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_14908\\2948510432.py:70: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.\n", + " axes[i].set_xticklabels(params.keys(), rotation=45, ha=\"right\") #Поворачиваем подписи на оси х\n" ] }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ - "
" + "
" ] }, "metadata": {}, @@ -1592,328 +1849,78 @@ "source": [ "import pandas as pd\n", "import numpy as np\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", - "from sklearn.compose import ColumnTransformer\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score\n", - "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", - "\n", - "\n", - "# Загружаем набор данных\n", - "df = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n", - "\n", - "# Устанавливаем случайное состояние\n", - "random_state = 42\n", - "\n", - "\n", - "# Предобработка данных\n", - "# Определяем категориальные и числовые столбцы\n", - "categorical_features = ['employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']\n", - "numeric_features = ['work_year', 'salary_in_usd', 'remote_ratio']\n", - "\n", - "# Создаем пайплайн для обработки данных\n", - "preprocessor = ColumnTransformer(\n", - " transformers=[\n", - " ('num', StandardScaler(), numeric_features),\n", - " ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])\n", - "\n", - "# Определяем целевую переменную и признаки\n", - "X = df.drop('experience_level', axis=1)\n", - "y = df['experience_level']\n", - "\n", - "# Разделяем данные на обучающую и тестовую выборки\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)\n", - "\n", - "# Создаем и обучаем модель\n", - "model = Pipeline(steps=[\n", - " ('preprocessor', preprocessor),\n", - " ('classifier', RandomForestClassifier(random_state=random_state))])\n", - "\n", - "model.fit(X_train, y_train)\n", - "\n", - "# Делаем предсказания на тестовой выборке\n", - "y_pred = model.predict(X_test)\n", - "\n", - "# Оцениваем качество модели\n", - "print(\"Classification 
Report:\")\n", - "print(classification_report(y_test, y_pred))\n", - "\n", - "print(\"Confusion Matrix:\")\n", - "print(confusion_matrix(y_test, y_pred))\n", - "\n", - "print(f\"Accuracy Score: {accuracy_score(y_test, y_pred)}\")\n", - "\n", - "# Визуализация результатов\n", - "conf_matrix = confusion_matrix(y_test, y_pred)\n", - "sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')\n", - "plt.xlabel('Predicted')\n", - "plt.ylabel('Actual')\n", - "plt.title('Confusion Matrix')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Ориентир**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MAE: 37795.639591701794\n", - "MSE: 2482079980.9527493\n", - "RMSE: 49820.47752634201\n", - "R²: 0.37127352660208646\n", - "Ориентиры для предсказания заработной платы не достигнуты.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. 
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from scipy.stats import uniform, randint

# Load the data
df = pd.read_csv("..//static//csv//ds_salaries.csv")

# This cell does not rebuild the preprocessed data: X_train and y_train must
# already exist in the kernel (the original placeholder here read "your
# preprocessing code, as in the previous example").
# NOTE(review): presumably they come from the preceding preprocessing cell - confirm.

# Search space per model. Lists are sampled uniformly; scipy distributions are
# sampled through their rvs() method.
param_distributions = {
    'Linear Regression': {
        'regressor__fit_intercept': [True, False],
        'regressor__positive': [True, False]
    },
    'Random Forest': {
        'regressor__n_estimators': randint(50, 200),
        'regressor__max_depth': [None, 10, 20],
        'regressor__min_samples_split': randint(2, 11),
        'regressor__min_samples_leaf': randint(1, 5),
        'regressor__bootstrap': [True, False]
    },
    'Gradient Boosting': {
        'regressor__n_estimators': randint(50, 200),
        'regressor__learning_rate': uniform(0.01, 0.49),  # uniform on [0.01, 0.50]
        'regressor__max_depth': [3, 5, 7],
        'regressor__min_samples_split': randint(2, 11),
        'regressor__min_samples_leaf': randint(1, 5),
        'regressor__subsample': uniform(0.5, 0.5)  # uniform on [0.5, 1.0]
    }
}

# Estimator to tune for each search-space entry. Names without an estimator are skipped.
estimators = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}


def _search_iterations(space, requested):
    """Cap `requested` at the grid size when every parameter is a finite list.

    RandomizedSearchCV warns when n_iter exceeds the number of distinct
    combinations of a purely discrete search space.
    """
    if not all(isinstance(options, list) for options in space.values()):
        return requested
    grid_size = 1
    for options in space.values():
        grid_size *= len(options)
    return min(requested, grid_size)


# Best hyperparameters found for each model.
best_models = {}

# Tune every model with a randomized search.
for model_name, model_params in param_distributions.items():
    model = estimators.get(model_name)
    if model is None:
        continue  # unknown model name

    # LinearRegression(positive=True) accepts dense input only. Fitting it on
    # the sparse one-hot matrix made half of the search fits fail, so the
    # matrix is converted to dense for this one model.
    if model_name == 'Linear Regression' and hasattr(X_train, 'toarray'):
        X_fit = X_train.toarray()
    else:
        X_fit = X_train

    pipeline = Pipeline([('regressor', model)])
    random_search = RandomizedSearchCV(
        pipeline,
        param_distributions=model_params,
        n_iter=_search_iterations(model_params, 10),
        cv=3,
        n_jobs=-1,
        random_state=42,
    )
    random_search.fit(X_fit, y_train)
    best_models[model_name] = random_search.best_params_


# Plot the best hyperparameters, one panel per model.
fig, axes = plt.subplots(len(best_models), 1, figsize=(10, 5 * len(best_models)))
if len(best_models) == 1:
    axes = [axes]  # plt.subplots returns a bare Axes for a single panel

for ax, (model_name, params) in zip(axes, best_models.items()):
    labels = list(params)
    # Booleans are drawn as 0/1; None (e.g. an unlimited max_depth) has no
    # numeric height and is drawn as a gap.
    heights = [float('nan') if value is None else float(value) for value in params.values()]
    positions = list(range(len(labels)))

    ax.bar(positions, heights)
    ax.set_title(f"Лучшие гиперпараметры для {model_name}")
    # Fix the tick positions before labelling them. Calling set_xticklabels()
    # on automatic ticks triggers a matplotlib UserWarning.
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=45, ha="right")
    ax.tick_params(axis='x', which='major', labelsize=8)

plt.tight_layout()
plt.show()