Lab hard
This commit is contained in:
parent
e4d193f30c
commit
a2a0639a29
@ -9,7 +9,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 52,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -17,36 +17,42 @@
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"RangeIndex: 100000 entries, 0 to 99999\n",
|
||||
"RangeIndex: 10000 entries, 0 to 9999\n",
|
||||
"Data columns (total 10 columns):\n",
|
||||
" # Column Non-Null Count Dtype \n",
|
||||
"--- ------ -------------- ----- \n",
|
||||
" 0 Id 100000 non-null object \n",
|
||||
" 1 Name 100000 non-null object \n",
|
||||
" 2 Short description 99923 non-null object \n",
|
||||
" 3 Gender 98015 non-null object \n",
|
||||
" 4 Country 94533 non-null object \n",
|
||||
" 5 Occupation 97299 non-null object \n",
|
||||
" 6 Birth year 100000 non-null int64 \n",
|
||||
" 7 Death year 99999 non-null float64\n",
|
||||
" 8 Manner of death 14821 non-null object \n",
|
||||
" 9 Age of death 99999 non-null float64\n",
|
||||
" 0 Id 10000 non-null object \n",
|
||||
" 1 Name 10000 non-null object \n",
|
||||
" 2 Short description 9996 non-null object \n",
|
||||
" 3 Gender 9927 non-null object \n",
|
||||
" 4 Country 9721 non-null object \n",
|
||||
" 5 Occupation 9836 non-null object \n",
|
||||
" 6 Birth year 10000 non-null int64 \n",
|
||||
" 7 Death year 9999 non-null float64\n",
|
||||
" 8 Manner of death 1893 non-null object \n",
|
||||
" 9 Age of death 9999 non-null float64\n",
|
||||
"dtypes: float64(2), int64(1), object(7)\n",
|
||||
"memory usage: 7.6+ MB\n"
|
||||
"memory usage: 781.4+ KB\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from sklearn.preprocessing import LabelEncoder\n",
|
||||
"from imblearn.over_sampling import RandomOverSampler\n",
|
||||
"from imblearn.under_sampling import RandomUnderSampler\n",
|
||||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||||
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
|
||||
"from sklearn.compose import ColumnTransformer\n",
|
||||
"from sklearn.pipeline import Pipeline\n",
|
||||
"from sklearn.impute import SimpleImputer\n",
|
||||
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
|
||||
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier\n",
|
||||
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
|
||||
"from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score\n",
|
||||
"import numpy as np\n",
|
||||
"import featuretools as ft\n",
|
||||
"\n",
|
||||
"from sklearn.metrics import accuracy_score, classification_report\n",
|
||||
"\n",
|
||||
"# Функция для применения oversampling\n",
|
||||
"def apply_oversampling(X, y):\n",
|
||||
@ -131,7 +137,7 @@
|
||||
" return df_train, df_val, df_test\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"df = pd.read_csv(\"../data/age.csv\", nrows=100000)\n",
|
||||
"df = pd.read_csv(\"../data/age.csv\", nrows=10000)\n",
|
||||
"df.info()"
|
||||
]
|
||||
},
|
||||
@ -144,8 +150,7 @@
|
||||
" 2) Исследование зависимости длительности жизни от страны проживания.\n",
|
||||
" \n",
|
||||
"Поскольку именно эти бизнес-цели были выбраны в предыдущей лабораторной работе, будем их использовать.\n",
|
||||
"Но возникает проблема с 1 целью: её невозможно использовать для машинного обучения. Заменим ее на следующую:\n",
|
||||
" Прогнозирование страны. Необходимо не имея такой параметр как страна примерно ее угадать для дальнейшей рекламы."
|
||||
"Но возникает проблема с первой целью: её невозможно использовать для задачи классификации. Заменим её на классификацию людей по возрастным группам, что может быть полезно для рекламных целей."
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -157,7 +162,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 46,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -168,24 +173,306 @@
|
||||
"data = df.copy()\n",
|
||||
"\n",
|
||||
"value_counts = data[\"Country\"].value_counts()\n",
|
||||
"rare = value_counts[value_counts < 50].index\n",
|
||||
"data = data[~data[\"Country\"].isin(rare)]"
|
||||
"rare = value_counts[value_counts < 100].index\n",
|
||||
"data = data[~data[\"Country\"].isin(rare)]\n",
|
||||
"\n",
|
||||
"data.drop(data[~data['Gender'].isin(['Male', 'Female'])].index, inplace=True)\n",
|
||||
"\n",
|
||||
"data1 = pd.get_dummies(data, columns=['Gender', 'Country', 'Occupation'], drop_first=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Определить достижимый уровень качества модели для каждой задачи. На основе имеющихся данных уровень качества моделей не будет высоким, поскольку все таки длительность жизни лишь примерная и точно ее угадать невозможно. А угадывание страны является трудной задачей, поскольку данные между людьми, живущими в разных странах, могут совпадать между собой."
|
||||
"Определить достижимый уровень качества модели для каждой задачи. На основе имеющихся данных уровень качества моделей не будет высоким, поскольку длительность жизни всё-таки можно оценить лишь приблизительно, а точно её угадать невозможно."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Выберем ориентиры для наших 2х задач:\n",
|
||||
" 1)Регрессии - средний возраст человека\n",
|
||||
" 2)Классификации - наиболее часто встречающаяся возрастная группа"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Построим конвейер."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Index(['Id', 'Name', 'Short description', 'Birth year', 'Death year',\n",
|
||||
" 'Age of death', 'Gender_Male', 'Country_France',\n",
|
||||
" 'Country_German Confederation', 'Country_German Democratic Republic',\n",
|
||||
" ...\n",
|
||||
" 'Manner of death_euthanasia', 'Manner of death_homicide',\n",
|
||||
" 'Manner of death_homicide; natural causes',\n",
|
||||
" 'Manner of death_internal bleeding', 'Manner of death_natural causes',\n",
|
||||
" 'Manner of death_suicide',\n",
|
||||
" 'Manner of death_suicide; homicide; accident',\n",
|
||||
" 'Manner of death_suicide; unfortunate accident',\n",
|
||||
" 'Manner of death_summary execution', 'Manner of death_unnatural death'],\n",
|
||||
" dtype='object', length=400)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(data.columns)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 37,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Best parameters for Linear Regression: {}\n",
|
||||
"Best parameters for Random Forest Regressor: {'model__max_depth': None, 'model__n_estimators': 100}\n",
|
||||
"Best parameters for Gradient Boosting Regressor: {'model__learning_rate': 0.2, 'model__max_depth': 7, 'model__n_estimators': 300}\n",
|
||||
"Linear Regression: MSE = 0.002807184047660083, R2 = 0.9999899555289343\n",
|
||||
"Random Forest Regressor: MSE = 11.46917740409879, R2 = 0.9589617856804076\n",
|
||||
"Gradient Boosting Regressor: MSE = 8.202651735797296, R2 = 0.9706498410424512\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X_reg = data1.drop(['Id', 'Name', 'Age of death', 'Short description', 'Manner of death'], axis=1)\n",
|
||||
"y_reg = data1['Age of death']\n",
|
||||
"\n",
|
||||
"# Разделение данных\n",
|
||||
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
|
||||
"\n",
|
||||
"# Выбор моделей для регрессии\n",
|
||||
"models_reg = {\n",
|
||||
" 'Linear Regression': LinearRegression(),\n",
|
||||
" 'Random Forest Regressor': RandomForestRegressor(random_state=42),\n",
|
||||
" 'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42)\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Создание конвейера для регрессии\n",
|
||||
"pipelines_reg = {}\n",
|
||||
"for name, model in models_reg.items():\n",
|
||||
" pipelines_reg[name] = Pipeline([\n",
|
||||
" ('scaler', StandardScaler()),\n",
|
||||
" ('model', model)\n",
|
||||
" ])\n",
|
||||
"\n",
|
||||
"# Определение сетки гиперпараметров для регрессии\n",
|
||||
"param_grids_reg = {\n",
|
||||
" 'Linear Regression': {},\n",
|
||||
" 'Random Forest Regressor': {\n",
|
||||
" 'model__n_estimators': [100, 200, 300],\n",
|
||||
" 'model__max_depth': [None, 10, 20, 30]\n",
|
||||
" },\n",
|
||||
" 'Gradient Boosting Regressor': {\n",
|
||||
" 'model__n_estimators': [100, 200, 300],\n",
|
||||
" 'model__learning_rate': [0.01, 0.1, 0.2],\n",
|
||||
" 'model__max_depth': [3, 5, 7]\n",
|
||||
" }\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Настройка гиперпараметров для регрессии\n",
|
||||
"best_models_reg = {}\n",
|
||||
"for name, pipeline in pipelines_reg.items():\n",
|
||||
" grid_search = GridSearchCV(pipeline, param_grids_reg[name], cv=5, scoring='neg_mean_squared_error')\n",
|
||||
" grid_search.fit(X_train_reg, y_train_reg)\n",
|
||||
" best_models_reg[name] = grid_search.best_estimator_\n",
|
||||
" print(f'Best parameters for {name}: {grid_search.best_params_}')\n",
|
||||
"\n",
|
||||
"# Обучение моделей для регрессии\n",
|
||||
"for name, model in best_models_reg.items():\n",
|
||||
" model.fit(X_train_reg, y_train_reg)\n",
|
||||
"\n",
|
||||
"# Оценка качества моделей для регрессии\n",
|
||||
"for name, model in best_models_reg.items():\n",
|
||||
" y_pred_reg = model.predict(X_test_reg)\n",
|
||||
" mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
|
||||
" r2 = r2_score(y_test_reg, y_pred_reg)\n",
|
||||
" print(f'{name}: MSE = {mse}, R2 = {r2}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 50,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data['Age_Category'] = pd.cut(data['Age'], bins=[0, 29, 59, float('inf')], labels=[\"young\", \"middle-aged\", \"old\"])"
|
||||
"data2 = data.drop(['Short description', 'Manner of death', 'Gender', 'Country', 'Occupation'], axis=1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 54,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Index(['Birth year', 'Death year'], dtype='object')\n",
|
||||
"Best parameters for Logistic Regression: {'model__C': 10, 'model__solver': 'lbfgs'}\n",
|
||||
"Best parameters for Random Forest Classifier: {'model__max_depth': 30, 'model__n_estimators': 200}\n",
|
||||
"Best parameters for Gradient Boosting Classifier: {'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__n_estimators': 200}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
|
||||
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
|
||||
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Logistic Regression: Accuracy = 0.9248554913294798\n",
|
||||
"Classification Report:\n",
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" 0-18 0.00 0.00 0.00 5\n",
|
||||
" 19-30 0.50 0.08 0.14 60\n",
|
||||
" 31-50 0.77 0.77 0.77 242\n",
|
||||
" 51-70 0.91 0.96 0.94 650\n",
|
||||
" 71+ 0.98 1.00 0.99 946\n",
|
||||
"\n",
|
||||
" accuracy 0.92 1903\n",
|
||||
" macro avg 0.63 0.56 0.57 1903\n",
|
||||
"weighted avg 0.91 0.92 0.91 1903\n",
|
||||
"\n",
|
||||
"Random Forest Classifier: Accuracy = 0.9485023646873357\n",
|
||||
"Classification Report:\n",
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" 0-18 0.67 0.40 0.50 5\n",
|
||||
" 19-30 0.96 0.77 0.85 60\n",
|
||||
" 31-50 0.88 0.89 0.88 242\n",
|
||||
" 51-70 0.92 0.95 0.94 650\n",
|
||||
" 71+ 0.99 0.97 0.98 946\n",
|
||||
"\n",
|
||||
" accuracy 0.95 1903\n",
|
||||
" macro avg 0.88 0.80 0.83 1903\n",
|
||||
"weighted avg 0.95 0.95 0.95 1903\n",
|
||||
"\n",
|
||||
"Gradient Boosting Classifier: Accuracy = 0.9379926431949553\n",
|
||||
"Classification Report:\n",
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" 0-18 1.00 0.40 0.57 5\n",
|
||||
" 19-30 0.96 0.77 0.85 60\n",
|
||||
" 31-50 0.87 0.87 0.87 242\n",
|
||||
" 51-70 0.90 0.95 0.92 650\n",
|
||||
" 71+ 0.98 0.96 0.97 946\n",
|
||||
"\n",
|
||||
" accuracy 0.94 1903\n",
|
||||
" macro avg 0.94 0.79 0.84 1903\n",
|
||||
"weighted avg 0.94 0.94 0.94 1903\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Создание возрастных групп\n",
|
||||
"bins = [0, 18, 30, 50, 70, 100]\n",
|
||||
"labels = ['0-18', '19-30', '31-50', '51-70', '71+']\n",
|
||||
"data['Age Group'] = pd.cut(data['Age of death'], bins=bins, labels=labels)\n",
|
||||
"\n",
|
||||
"# Выбор признаков и целевой переменной для классификации\n",
|
||||
"X_class = data2.drop(['Id', 'Name', 'Age of death', 'Age Group'], axis=1)\n",
|
||||
"y_class = data['Age Group'] \n",
|
||||
"print(X_class.columns)\n",
|
||||
"# Разделение данных\n",
|
||||
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n",
|
||||
"\n",
|
||||
"# Выбор моделей для классификации\n",
|
||||
"models_class = {\n",
|
||||
" 'Logistic Regression': LogisticRegression(random_state=42, max_iter=5000, solver='liblinear'),\n",
|
||||
" 'Random Forest Classifier': RandomForestClassifier(random_state=42),\n",
|
||||
" 'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=42)\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Создание конвейера для классификации\n",
|
||||
"pipelines_class = {}\n",
|
||||
"for name, model in models_class.items():\n",
|
||||
" pipelines_class[name] = Pipeline([\n",
|
||||
" ('scaler', StandardScaler()),\n",
|
||||
" ('model', model)\n",
|
||||
" ])\n",
|
||||
"\n",
|
||||
"# Определение сетки гиперпараметров для классификации\n",
|
||||
"'''\n",
|
||||
"param_grids_class = {\n",
|
||||
" 'Logistic Regression': {\n",
|
||||
" 'model__C': [0.1, 1, 10],\n",
|
||||
" 'model__solver': ['lbfgs', 'liblinear']\n",
|
||||
" },\n",
|
||||
" 'Random Forest Classifier': {\n",
|
||||
" 'model__n_estimators': [100, 200, 300],\n",
|
||||
" 'model__max_depth': [None, 10, 20, 30]\n",
|
||||
" },\n",
|
||||
" 'Gradient Boosting Classifier': {\n",
|
||||
" 'model__n_estimators': [100, 200, 300],\n",
|
||||
" 'model__learning_rate': [0.01, 0.1, 0.2],\n",
|
||||
" 'model__max_depth': [3, 5, 7]\n",
|
||||
" }\n",
|
||||
"}'''\n",
|
||||
"# Убрал определение параметров поскольку уже был предподсчет данных, но вылетела ошибка. Сохранил лучшие параметры\n",
|
||||
"\n",
|
||||
"param_grids_class = {\n",
|
||||
" 'Logistic Regression': {\n",
|
||||
" 'model__C': [10],\n",
|
||||
" 'model__solver': ['lbfgs']\n",
|
||||
" },\n",
|
||||
" 'Random Forest Classifier': {\n",
|
||||
" 'model__n_estimators': [200],\n",
|
||||
" 'model__max_depth': [ 30]\n",
|
||||
" },\n",
|
||||
" 'Gradient Boosting Classifier': {\n",
|
||||
" 'model__n_estimators': [200],\n",
|
||||
" 'model__learning_rate': [0.1],\n",
|
||||
" 'model__max_depth': [7]\n",
|
||||
" }\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Настройка гиперпараметров для классификации\n",
|
||||
"best_models_class = {}\n",
|
||||
"for name, pipeline in pipelines_class.items():\n",
|
||||
" grid_search = GridSearchCV(pipeline, param_grids_class[name], cv=5, scoring='accuracy')\n",
|
||||
" grid_search.fit(X_train_class, y_train_class)\n",
|
||||
" best_models_class[name] = grid_search.best_estimator_\n",
|
||||
" print(f'Best parameters for {name}: {grid_search.best_params_}')\n",
|
||||
"\n",
|
||||
"# Обучение моделей для классификации\n",
|
||||
"for name, model in best_models_class.items():\n",
|
||||
" model.fit(X_train_class, y_train_class)\n",
|
||||
"\n",
|
||||
"# Оценка качества моделей для классификации\n",
|
||||
"for name, model in best_models_class.items():\n",
|
||||
" y_pred_class = model.predict(X_test_class)\n",
|
||||
" accuracy = accuracy_score(y_test_class, y_pred_class)\n",
|
||||
" report = classification_report(y_test_class, y_pred_class)\n",
|
||||
" print(f'{name}: Accuracy = {accuracy}')\n",
|
||||
" print(f'Classification Report:\\n{report}')"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
Loading…
Reference in New Issue
Block a user