lab_4 #9

Merged
Arutunyan-Dmitry merged 2 commits from lab_4 into main 2024-11-23 09:11:25 +04:00
Showing only changes of commit 6e5f2a5c25 - Show all commits

View File

@ -2935,7 +2935,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 45, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -2986,22 +2986,15 @@
"df = pd.read_csv(\"..//static//csv//Forbes Billionaires.csv\").head(100)\n", "df = pd.read_csv(\"..//static//csv//Forbes Billionaires.csv\").head(100)\n",
"\n", "\n",
"df = df.dropna()\n", "df = df.dropna()\n",
"\n",
"# Создание целевой переменной (Networth)\n",
"target = df['Networth']\n", "target = df['Networth']\n",
"\n", "\n",
"# Удаление целевой переменной из исходных данных\n",
"features = df.drop(columns=['Networth'])\n", "features = df.drop(columns=['Networth'])\n",
"\n", "\n",
"# Удаление столбцов, которые не будут использоваться (например, имена)\n",
"features = features.drop(columns=['Name'])\n", "features = features.drop(columns=['Name'])\n",
"\n",
"# Определение столбцов для обработки\n",
"num_columns = features.select_dtypes(include=['number']).columns\n", "num_columns = features.select_dtypes(include=['number']).columns\n",
"cat_columns = features.select_dtypes(include=['object']).columns\n", "cat_columns = features.select_dtypes(include=['object']).columns\n",
"\n", "\n",
"# Препроцессинг числовых столбцов\n", "num_imputer = SimpleImputer(strategy=\"median\") \n",
"num_imputer = SimpleImputer(strategy=\"median\") # Используем медиану для заполнения пропущенных значений в числовых столбцах\n",
"num_scaler = StandardScaler()\n", "num_scaler = StandardScaler()\n",
"preprocessing_num = Pipeline(\n", "preprocessing_num = Pipeline(\n",
" [\n", " [\n",
@ -3010,8 +3003,7 @@
" ]\n", " ]\n",
")\n", ")\n",
"\n", "\n",
"# Препроцессинг категориальных столбцов\n", "cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\") \n",
"cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\") # Используем 'unknown' для заполнения пропущенных значений в категориальных столбцах\n",
"cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n", "cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n",
"preprocessing_cat = Pipeline(\n", "preprocessing_cat = Pipeline(\n",
" [\n", " [\n",
@ -3020,7 +3012,6 @@
" ]\n", " ]\n",
")\n", ")\n",
"\n", "\n",
"# Объединение препроцессинга\n",
"features_preprocessing = ColumnTransformer(\n", "features_preprocessing = ColumnTransformer(\n",
" verbose_feature_names_out=False,\n", " verbose_feature_names_out=False,\n",
" transformers=[\n", " transformers=[\n",
@ -3030,59 +3021,47 @@
" remainder=\"passthrough\"\n", " remainder=\"passthrough\"\n",
")\n", ")\n",
"\n", "\n",
"# Создание финального пайплайна\n",
"pipeline_end = Pipeline(\n", "pipeline_end = Pipeline(\n",
" [\n", " [\n",
" (\"features_preprocessing\", features_preprocessing),\n", " (\"features_preprocessing\", features_preprocessing),\n",
" ]\n", " ]\n",
")\n", ")\n",
"\n", "\n",
"# Разделение данных на обучающую и тестовую выборки\n",
"X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)\n", "X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)\n",
"\n", "\n",
"# Применение пайплайна к данным\n",
"X_train_processed = pipeline_end.fit_transform(X_train)\n", "X_train_processed = pipeline_end.fit_transform(X_train)\n",
"X_test_processed = pipeline_end.transform(X_test)\n", "X_test_processed = pipeline_end.transform(X_test)\n",
"\n", "\n",
"# 1. Настройка параметров для старых значений\n",
"old_param_grid = {\n", "old_param_grid = {\n",
" 'n_estimators': [50, 100, 200], # Количество деревьев\n", " 'n_estimators': [50, 100, 200], \n",
" 'max_depth': [None, 10, 20, 30], # Максимальная глубина дерева\n", " 'max_depth': [None, 10, 20, 30],\n",
" 'min_samples_split': [2, 5, 10] # Минимальное количество образцов для разбиения узла\n", " 'min_samples_split': [2, 5, 10] \n",
"}\n", "}\n",
"\n", "\n",
"# Подбор гиперпараметров с помощью Grid Search для старых параметров\n",
"old_grid_search = GridSearchCV(estimator=RandomForestRegressor(), \n", "old_grid_search = GridSearchCV(estimator=RandomForestRegressor(), \n",
" param_grid=old_param_grid,\n", " param_grid=old_param_grid,\n",
" scoring='neg_mean_squared_error', cv=3, n_jobs=-1, verbose=2)\n", " scoring='neg_mean_squared_error', cv=3, n_jobs=-1, verbose=2)\n",
"\n", "\n",
"# Обучение модели на тренировочных данных\n",
"old_grid_search.fit(X_train_processed, y_train)\n", "old_grid_search.fit(X_train_processed, y_train)\n",
"\n",
"# 2. Результаты подбора для старых параметров\n",
"old_best_params = old_grid_search.best_params_\n", "old_best_params = old_grid_search.best_params_\n",
"old_best_mse = -old_grid_search.best_score_ # Меняем знак, так как берем отрицательное значение MSE\n", "old_best_mse = -old_grid_search.best_score_ \n",
"\n",
"# 3. Настройка параметров для новых значений\n",
"new_param_grid = {\n", "new_param_grid = {\n",
" 'n_estimators': [200],\n", " 'n_estimators': [200],\n",
" 'max_depth': [10],\n", " 'max_depth': [10],\n",
" 'min_samples_split': [10]\n", " 'min_samples_split': [10]\n",
"}\n", "}\n",
"\n", "\n",
"# Подбор гиперпараметров с помощью Grid Search для новых параметров\n",
"new_grid_search = GridSearchCV(estimator=RandomForestRegressor(), \n", "new_grid_search = GridSearchCV(estimator=RandomForestRegressor(), \n",
" param_grid=new_param_grid,\n", " param_grid=new_param_grid,\n",
" scoring='neg_mean_squared_error', cv=2)\n", " scoring='neg_mean_squared_error', cv=2)\n",
"\n", "\n",
"# Обучение модели на тренировочных данных\n",
"new_grid_search.fit(X_train_processed, y_train)\n", "new_grid_search.fit(X_train_processed, y_train)\n",
"\n", "\n",
"# 4. Результаты подбора для новых параметров\n", "# Результаты подбора для новых параметров\n",
"new_best_params = new_grid_search.best_params_\n", "new_best_params = new_grid_search.best_params_\n",
"new_best_mse = -new_grid_search.best_score_ # Меняем знак, так как берем отрицательное значение MSE\n", "new_best_mse = -new_grid_search.best_score_ # Меняем знак, так как берем отрицательное значение MSE\n",
"\n", "\n",
"# 5. Обучение модели с лучшими параметрами для новых значений\n", "# Обучение модели с лучшими параметрами для новых значений\n",
"model_best = RandomForestRegressor(**new_best_params)\n", "model_best = RandomForestRegressor(**new_best_params)\n",
"model_best.fit(X_train_processed, y_train)\n", "model_best.fit(X_train_processed, y_train)\n",
"\n", "\n",
@ -3101,11 +3080,9 @@
"print(\"Среднеквадратическая ошибка (MSE) на тестовых данных:\", mse)\n", "print(\"Среднеквадратическая ошибка (MSE) на тестовых данных:\", mse)\n",
"print(\"Корень среднеквадратичной ошибки (RMSE) на тестовых данных:\", rmse)\n", "print(\"Корень среднеквадратичной ошибки (RMSE) на тестовых данных:\", rmse)\n",
"\n", "\n",
"# Обучение модели с лучшими параметрами для старых значений\n",
"model_old = RandomForestRegressor(**old_best_params)\n", "model_old = RandomForestRegressor(**old_best_params)\n",
"model_old.fit(X_train_processed, y_train)\n", "model_old.fit(X_train_processed, y_train)\n",
"\n", "\n",
"# Прогнозирование на тестовой выборке для старых параметров\n",
"y_pred_old = model_old.predict(X_test_processed)\n", "y_pred_old = model_old.predict(X_test_processed)\n",
"\n", "\n",
"# Визуализация ошибок\n", "# Визуализация ошибок\n",