lab_4 #9
@ -2935,7 +2935,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 45,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -2986,22 +2986,15 @@
|
|||||||
"df = pd.read_csv(\"..//static//csv//Forbes Billionaires.csv\").head(100)\n",
|
"df = pd.read_csv(\"..//static//csv//Forbes Billionaires.csv\").head(100)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"df = df.dropna()\n",
|
"df = df.dropna()\n",
|
||||||
"\n",
|
|
||||||
"# Создание целевой переменной (Networth)\n",
|
|
||||||
"target = df['Networth']\n",
|
"target = df['Networth']\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Удаление целевой переменной из исходных данных\n",
|
|
||||||
"features = df.drop(columns=['Networth'])\n",
|
"features = df.drop(columns=['Networth'])\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Удаление столбцов, которые не будут использоваться (например, имена)\n",
|
|
||||||
"features = features.drop(columns=['Name'])\n",
|
"features = features.drop(columns=['Name'])\n",
|
||||||
"\n",
|
|
||||||
"# Определение столбцов для обработки\n",
|
|
||||||
"num_columns = features.select_dtypes(include=['number']).columns\n",
|
"num_columns = features.select_dtypes(include=['number']).columns\n",
|
||||||
"cat_columns = features.select_dtypes(include=['object']).columns\n",
|
"cat_columns = features.select_dtypes(include=['object']).columns\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Препроцессинг числовых столбцов\n",
|
"num_imputer = SimpleImputer(strategy=\"median\") \n",
|
||||||
"num_imputer = SimpleImputer(strategy=\"median\") # Используем медиану для заполнения пропущенных значений в числовых столбцах\n",
|
|
||||||
"num_scaler = StandardScaler()\n",
|
"num_scaler = StandardScaler()\n",
|
||||||
"preprocessing_num = Pipeline(\n",
|
"preprocessing_num = Pipeline(\n",
|
||||||
" [\n",
|
" [\n",
|
||||||
@ -3010,8 +3003,7 @@
|
|||||||
" ]\n",
|
" ]\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Препроцессинг категориальных столбцов\n",
|
"cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\") \n",
|
||||||
"cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\") # Используем 'unknown' для заполнения пропущенных значений в категориальных столбцах\n",
|
|
||||||
"cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n",
|
"cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n",
|
||||||
"preprocessing_cat = Pipeline(\n",
|
"preprocessing_cat = Pipeline(\n",
|
||||||
" [\n",
|
" [\n",
|
||||||
@ -3020,7 +3012,6 @@
|
|||||||
" ]\n",
|
" ]\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Объединение препроцессинга\n",
|
|
||||||
"features_preprocessing = ColumnTransformer(\n",
|
"features_preprocessing = ColumnTransformer(\n",
|
||||||
" verbose_feature_names_out=False,\n",
|
" verbose_feature_names_out=False,\n",
|
||||||
" transformers=[\n",
|
" transformers=[\n",
|
||||||
@ -3030,59 +3021,47 @@
|
|||||||
" remainder=\"passthrough\"\n",
|
" remainder=\"passthrough\"\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Создание финального пайплайна\n",
|
|
||||||
"pipeline_end = Pipeline(\n",
|
"pipeline_end = Pipeline(\n",
|
||||||
" [\n",
|
" [\n",
|
||||||
" (\"features_preprocessing\", features_preprocessing),\n",
|
" (\"features_preprocessing\", features_preprocessing),\n",
|
||||||
" ]\n",
|
" ]\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Разделение данных на обучающую и тестовую выборки\n",
|
|
||||||
"X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)\n",
|
"X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Применение пайплайна к данным\n",
|
|
||||||
"X_train_processed = pipeline_end.fit_transform(X_train)\n",
|
"X_train_processed = pipeline_end.fit_transform(X_train)\n",
|
||||||
"X_test_processed = pipeline_end.transform(X_test)\n",
|
"X_test_processed = pipeline_end.transform(X_test)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# 1. Настройка параметров для старых значений\n",
|
|
||||||
"old_param_grid = {\n",
|
"old_param_grid = {\n",
|
||||||
" 'n_estimators': [50, 100, 200], # Количество деревьев\n",
|
" 'n_estimators': [50, 100, 200], \n",
|
||||||
" 'max_depth': [None, 10, 20, 30], # Максимальная глубина дерева\n",
|
" 'max_depth': [None, 10, 20, 30],\n",
|
||||||
" 'min_samples_split': [2, 5, 10] # Минимальное количество образцов для разбиения узла\n",
|
" 'min_samples_split': [2, 5, 10] \n",
|
||||||
"}\n",
|
"}\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Подбор гиперпараметров с помощью Grid Search для старых параметров\n",
|
|
||||||
"old_grid_search = GridSearchCV(estimator=RandomForestRegressor(), \n",
|
"old_grid_search = GridSearchCV(estimator=RandomForestRegressor(), \n",
|
||||||
" param_grid=old_param_grid,\n",
|
" param_grid=old_param_grid,\n",
|
||||||
" scoring='neg_mean_squared_error', cv=3, n_jobs=-1, verbose=2)\n",
|
" scoring='neg_mean_squared_error', cv=3, n_jobs=-1, verbose=2)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Обучение модели на тренировочных данных\n",
|
|
||||||
"old_grid_search.fit(X_train_processed, y_train)\n",
|
"old_grid_search.fit(X_train_processed, y_train)\n",
|
||||||
"\n",
|
|
||||||
"# 2. Результаты подбора для старых параметров\n",
|
|
||||||
"old_best_params = old_grid_search.best_params_\n",
|
"old_best_params = old_grid_search.best_params_\n",
|
||||||
"old_best_mse = -old_grid_search.best_score_ # Меняем знак, так как берем отрицательное значение MSE\n",
|
"old_best_mse = -old_grid_search.best_score_ \n",
|
||||||
"\n",
|
|
||||||
"# 3. Настройка параметров для новых значений\n",
|
|
||||||
"new_param_grid = {\n",
|
"new_param_grid = {\n",
|
||||||
" 'n_estimators': [200],\n",
|
" 'n_estimators': [200],\n",
|
||||||
" 'max_depth': [10],\n",
|
" 'max_depth': [10],\n",
|
||||||
" 'min_samples_split': [10]\n",
|
" 'min_samples_split': [10]\n",
|
||||||
"}\n",
|
"}\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Подбор гиперпараметров с помощью Grid Search для новых параметров\n",
|
|
||||||
"new_grid_search = GridSearchCV(estimator=RandomForestRegressor(), \n",
|
"new_grid_search = GridSearchCV(estimator=RandomForestRegressor(), \n",
|
||||||
" param_grid=new_param_grid,\n",
|
" param_grid=new_param_grid,\n",
|
||||||
" scoring='neg_mean_squared_error', cv=2)\n",
|
" scoring='neg_mean_squared_error', cv=2)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Обучение модели на тренировочных данных\n",
|
|
||||||
"new_grid_search.fit(X_train_processed, y_train)\n",
|
"new_grid_search.fit(X_train_processed, y_train)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# 4. Результаты подбора для новых параметров\n",
|
"# Результаты подбора для новых параметров\n",
|
||||||
"new_best_params = new_grid_search.best_params_\n",
|
"new_best_params = new_grid_search.best_params_\n",
|
||||||
"new_best_mse = -new_grid_search.best_score_ # Меняем знак, так как берем отрицательное значение MSE\n",
|
"new_best_mse = -new_grid_search.best_score_ # Меняем знак, так как берем отрицательное значение MSE\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# 5. Обучение модели с лучшими параметрами для новых значений\n",
|
"# Обучение модели с лучшими параметрами для новых значений\n",
|
||||||
"model_best = RandomForestRegressor(**new_best_params)\n",
|
"model_best = RandomForestRegressor(**new_best_params)\n",
|
||||||
"model_best.fit(X_train_processed, y_train)\n",
|
"model_best.fit(X_train_processed, y_train)\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -3101,11 +3080,9 @@
|
|||||||
"print(\"Среднеквадратическая ошибка (MSE) на тестовых данных:\", mse)\n",
|
"print(\"Среднеквадратическая ошибка (MSE) на тестовых данных:\", mse)\n",
|
||||||
"print(\"Корень среднеквадратичной ошибки (RMSE) на тестовых данных:\", rmse)\n",
|
"print(\"Корень среднеквадратичной ошибки (RMSE) на тестовых данных:\", rmse)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Обучение модели с лучшими параметрами для старых значений\n",
|
|
||||||
"model_old = RandomForestRegressor(**old_best_params)\n",
|
"model_old = RandomForestRegressor(**old_best_params)\n",
|
||||||
"model_old.fit(X_train_processed, y_train)\n",
|
"model_old.fit(X_train_processed, y_train)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Прогнозирование на тестовой выборке для старых параметров\n",
|
|
||||||
"y_pred_old = model_old.predict(X_test_processed)\n",
|
"y_pred_old = model_old.predict(X_test_processed)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Визуализация ошибок\n",
|
"# Визуализация ошибок\n",
|
||||||
|
Loading…
Reference in New Issue
Block a user