This commit is contained in:
Serxiolog 2024-11-22 16:07:46 +04:00
parent e4d193f30c
commit a2a0639a29

View File

@ -9,7 +9,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 52,
"metadata": {},
"outputs": [
{
@ -17,36 +17,42 @@
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 100000 entries, 0 to 99999\n",
"RangeIndex: 10000 entries, 0 to 9999\n",
"Data columns (total 10 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Id 100000 non-null object \n",
" 1 Name 100000 non-null object \n",
" 2 Short description 99923 non-null object \n",
" 3 Gender 98015 non-null object \n",
" 4 Country 94533 non-null object \n",
" 5 Occupation 97299 non-null object \n",
" 6 Birth year 100000 non-null int64 \n",
" 7 Death year 99999 non-null float64\n",
" 8 Manner of death 14821 non-null object \n",
" 9 Age of death 99999 non-null float64\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Id 10000 non-null object \n",
" 1 Name 10000 non-null object \n",
" 2 Short description 9996 non-null object \n",
" 3 Gender 9927 non-null object \n",
" 4 Country 9721 non-null object \n",
" 5 Occupation 9836 non-null object \n",
" 6 Birth year 10000 non-null int64 \n",
" 7 Death year 9999 non-null float64\n",
" 8 Manner of death 1893 non-null object \n",
" 9 Age of death 9999 non-null float64\n",
"dtypes: float64(2), int64(1), object(7)\n",
"memory usage: 7.6+ MB\n"
"memory usage: 781.4+ KB\n"
]
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier\n",
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
"from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score\n",
"import numpy as np\n",
"import featuretools as ft\n",
"\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"\n",
"# Функция для применения oversampling\n",
"def apply_oversampling(X, y):\n",
@ -131,7 +137,7 @@
" return df_train, df_val, df_test\n",
"\n",
"\n",
"df = pd.read_csv(\"../data/age.csv\", nrows=100000)\n",
"df = pd.read_csv(\"../data/age.csv\", nrows=10000)\n",
"df.info()"
]
},
@ -144,8 +150,7 @@
" 2) Исследование зависимости длительности жизни от страны проживания.\n",
" \n",
"Поскольку именно эти бизнес-цели были выбраны в предыдущей лабораторной работе, будем их использовать.\n",
"Но возникает проблема с 1 целью: её невозможно использовать для машинного обучения. Заменим ее на следующую:\n",
" Прогнозирование страны. Необходимо не имея такой параметр как страна примерно ее угадать для дальнейшей рекламы."
"Но возникает проблема с 1 целью: её невозможно использовать для задачи классификации. Заменим ее на классификацию людей по возрастным группам, что может быть полезно для рекламных целей."
]
},
{
@ -157,7 +162,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
@ -168,24 +173,306 @@
"data = df.copy()\n",
"\n",
"value_counts = data[\"Country\"].value_counts()\n",
"rare = value_counts[value_counts < 50].index\n",
"data = data[~data[\"Country\"].isin(rare)]"
"rare = value_counts[value_counts < 100].index\n",
"data = data[~data[\"Country\"].isin(rare)]\n",
"\n",
"data.drop(data[~data['Gender'].isin(['Male', 'Female'])].index, inplace=True)\n",
"\n",
"data1 = pd.get_dummies(data, columns=['Gender', 'Country', 'Occupation'], drop_first=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Определить достижимый уровень качества модели для каждой задачи. На основе имеющихся данных уровень качества моделей не будет высоким, поскольку все таки длительность жизни лишь примерная и точно ее угадать невозможно. А угадывание страны является трудной задачей, поскольку данные между людьми, живущими в разных странах, могут совпадать между собой."
"Определить достижимый уровень качества модели для каждой задачи. На основе имеющихся данных уровень качества моделей не будет высоким, поскольку всё-таки длительность жизни лишь примерная и точно её угадать невозможно."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выберем ориентиры для наших 2х задач:\n",
" 1)Регрессии - средний возраст человека\n",
" 2)Классификации - наиболее часто встречающаяся возрастная группа"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Построим конвейер."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Id', 'Name', 'Short description', 'Birth year', 'Death year',\n",
" 'Age of death', 'Gender_Male', 'Country_France',\n",
" 'Country_German Confederation', 'Country_German Democratic Republic',\n",
" ...\n",
" 'Manner of death_euthanasia', 'Manner of death_homicide',\n",
" 'Manner of death_homicide; natural causes',\n",
" 'Manner of death_internal bleeding', 'Manner of death_natural causes',\n",
" 'Manner of death_suicide',\n",
" 'Manner of death_suicide; homicide; accident',\n",
" 'Manner of death_suicide; unfortunate accident',\n",
" 'Manner of death_summary execution', 'Manner of death_unnatural death'],\n",
" dtype='object', length=400)\n"
]
}
],
"source": [
"print(data.columns)\n"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters for Linear Regression: {}\n",
"Best parameters for Random Forest Regressor: {'model__max_depth': None, 'model__n_estimators': 100}\n",
"Best parameters for Gradient Boosting Regressor: {'model__learning_rate': 0.2, 'model__max_depth': 7, 'model__n_estimators': 300}\n",
"Linear Regression: MSE = 0.002807184047660083, R2 = 0.9999899555289343\n",
"Random Forest Regressor: MSE = 11.46917740409879, R2 = 0.9589617856804076\n",
"Gradient Boosting Regressor: MSE = 8.202651735797296, R2 = 0.9706498410424512\n"
]
}
],
"source": [
"# Regression task: predict 'Age of death' from the birth year and the one-hot encoded\n",
"# Gender / Country / Occupation columns of `data1`.\n",
"# 'Death year' is deliberately excluded from the features: with it (next to 'Birth year')\n",
"# plain linear regression reached R2 ~ 0.99999 in the recorded run, i.e. the target was\n",
"# simply being reconstructed from the two year columns (target leakage).\n",
"reg_rows = data1['Age of death'].notna()  # rows without a target cannot be fitted or scored\n",
"X_reg = data1.loc[reg_rows].drop(\n",
"    ['Id', 'Name', 'Age of death', 'Death year', 'Short description', 'Manner of death'],\n",
"    axis=1\n",
")\n",
"y_reg = data1.loc[reg_rows, 'Age of death']\n",
"\n",
"# Hold out 20% of the rows for the final evaluation\n",
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
"\n",
"# Candidate regression models\n",
"models_reg = {\n",
"    'Linear Regression': LinearRegression(),\n",
"    'Random Forest Regressor': RandomForestRegressor(random_state=42),\n",
"    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42)\n",
"}\n",
"\n",
"# One pipeline per model: standardise the features, then fit the estimator\n",
"pipelines_reg = {\n",
"    name: Pipeline([\n",
"        ('scaler', StandardScaler()),\n",
"        ('model', model)\n",
"    ])\n",
"    for name, model in models_reg.items()\n",
"}\n",
"\n",
"# Hyper-parameter grids (the 'model__' prefix addresses the pipeline step named 'model')\n",
"param_grids_reg = {\n",
"    'Linear Regression': {},\n",
"    'Random Forest Regressor': {\n",
"        'model__n_estimators': [100, 200, 300],\n",
"        'model__max_depth': [None, 10, 20, 30]\n",
"    },\n",
"    'Gradient Boosting Regressor': {\n",
"        'model__n_estimators': [100, 200, 300],\n",
"        'model__learning_rate': [0.01, 0.1, 0.2],\n",
"        'model__max_depth': [3, 5, 7]\n",
"    }\n",
"}\n",
"\n",
"# 5-fold grid search per model. GridSearchCV (refit=True by default) already refits the\n",
"# best configuration on the whole training set, so no separate fit step is needed.\n",
"best_models_reg = {}\n",
"for name, pipeline in pipelines_reg.items():\n",
"    grid_search = GridSearchCV(pipeline, param_grids_reg[name], cv=5, scoring='neg_mean_squared_error')\n",
"    grid_search.fit(X_train_reg, y_train_reg)\n",
"    best_models_reg[name] = grid_search.best_estimator_\n",
"    print(f'Best parameters for {name}: {grid_search.best_params_}')\n",
"\n",
"# Evaluate every tuned model on the held-out test set\n",
"for name, model in best_models_reg.items():\n",
"    y_pred_reg = model.predict(X_test_reg)\n",
"    mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
"    r2 = r2_score(y_test_reg, y_pred_reg)\n",
"    print(f'{name}: MSE = {mse}, R2 = {r2}')"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"data['Age_Category'] = pd.cut(data['Age'], bins=[0, 29, 59, float('inf')], labels=[\"young\", \"middle-aged\", \"old\"])"
"# Frame for the classification task: `data` without its free-text and categorical columns\n",
"data2 = data.drop(columns=['Short description', 'Manner of death', 'Gender', 'Country', 'Occupation'])"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Birth year', 'Death year'], dtype='object')\n",
"Best parameters for Logistic Regression: {'model__C': 10, 'model__solver': 'lbfgs'}\n",
"Best parameters for Random Forest Classifier: {'model__max_depth': 30, 'model__n_estimators': 200}\n",
"Best parameters for Gradient Boosting Classifier: {'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__n_estimators': 200}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Logistic Regression: Accuracy = 0.9248554913294798\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 0-18 0.00 0.00 0.00 5\n",
" 19-30 0.50 0.08 0.14 60\n",
" 31-50 0.77 0.77 0.77 242\n",
" 51-70 0.91 0.96 0.94 650\n",
" 71+ 0.98 1.00 0.99 946\n",
"\n",
" accuracy 0.92 1903\n",
" macro avg 0.63 0.56 0.57 1903\n",
"weighted avg 0.91 0.92 0.91 1903\n",
"\n",
"Random Forest Classifier: Accuracy = 0.9485023646873357\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 0-18 0.67 0.40 0.50 5\n",
" 19-30 0.96 0.77 0.85 60\n",
" 31-50 0.88 0.89 0.88 242\n",
" 51-70 0.92 0.95 0.94 650\n",
" 71+ 0.99 0.97 0.98 946\n",
"\n",
" accuracy 0.95 1903\n",
" macro avg 0.88 0.80 0.83 1903\n",
"weighted avg 0.95 0.95 0.95 1903\n",
"\n",
"Gradient Boosting Classifier: Accuracy = 0.9379926431949553\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 0-18 1.00 0.40 0.57 5\n",
" 19-30 0.96 0.77 0.85 60\n",
" 31-50 0.87 0.87 0.87 242\n",
" 51-70 0.90 0.95 0.92 650\n",
" 71+ 0.98 0.96 0.97 946\n",
"\n",
" accuracy 0.94 1903\n",
" macro avg 0.94 0.79 0.84 1903\n",
"weighted avg 0.94 0.94 0.94 1903\n",
"\n"
]
}
],
"source": [
"# Classification task: assign every person to an age-at-death group.\n",
"# The last bin is open-ended so that ages above 100 still land in '71+', and\n",
"# include_lowest=True keeps an age of exactly 0 inside '0-18' (pd.cut bins are right-closed).\n",
"bins = [0, 18, 30, 50, 70, float('inf')]\n",
"labels = ['0-18', '19-30', '31-50', '51-70', '71+']\n",
"data['Age Group'] = pd.cut(data['Age of death'], bins=bins, labels=labels, include_lowest=True)\n",
"\n",
"# Features and target for classification.\n",
"# `data2` is created in an earlier cell, before 'Age Group' is added to `data`, so on a fresh\n",
"# kernel it has no such column; errors='ignore' keeps the drop from raising KeyError.\n",
"# NOTE(review): the printed feature list is only 'Birth year' and 'Death year', while the target\n",
"# is derived from 'Age of death'. If age is death year minus birth year this is target leakage -\n",
"# confirm and consider replacing 'Death year' with independent features.\n",
"class_rows = data['Age Group'].notna()  # rows with a missing age have no label\n",
"X_class = data2.loc[class_rows].drop(['Id', 'Name', 'Age of death', 'Age Group'], axis=1, errors='ignore')\n",
"y_class = data.loc[class_rows, 'Age Group']\n",
"print(X_class.columns)\n",
"\n",
"# Hold out 20% of the rows for the final evaluation\n",
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n",
"\n",
"# Candidate classification models\n",
"models_class = {\n",
"    'Logistic Regression': LogisticRegression(random_state=42, max_iter=5000, solver='liblinear'),\n",
"    'Random Forest Classifier': RandomForestClassifier(random_state=42),\n",
"    'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=42)\n",
"}\n",
"\n",
"# One pipeline per model: standardise the features, then fit the estimator\n",
"pipelines_class = {\n",
"    name: Pipeline([\n",
"        ('scaler', StandardScaler()),\n",
"        ('model', model)\n",
"    ])\n",
"    for name, model in models_class.items()\n",
"}\n",
"\n",
"# Hyper-parameter grids. The full search was run once before:\n",
"#   Logistic Regression:          C in [0.1, 1, 10], solver in ['lbfgs', 'liblinear']\n",
"#   Random Forest Classifier:     n_estimators in [100, 200, 300], max_depth in [None, 10, 20, 30]\n",
"#   Gradient Boosting Classifier: n_estimators in [100, 200, 300], learning_rate in [0.01, 0.1, 0.2],\n",
"#                                 max_depth in [3, 5, 7]\n",
"# Only the winning values are kept below so the expensive search is not repeated on every run.\n",
"param_grids_class = {\n",
"    'Logistic Regression': {\n",
"        'model__C': [10],\n",
"        'model__solver': ['lbfgs']\n",
"    },\n",
"    'Random Forest Classifier': {\n",
"        'model__n_estimators': [200],\n",
"        'model__max_depth': [30]\n",
"    },\n",
"    'Gradient Boosting Classifier': {\n",
"        'model__n_estimators': [200],\n",
"        'model__learning_rate': [0.1],\n",
"        'model__max_depth': [7]\n",
"    }\n",
"}\n",
"\n",
"# 5-fold grid search per model. GridSearchCV (refit=True by default) already refits the\n",
"# best configuration on the whole training set, so no separate fit step is needed.\n",
"best_models_class = {}\n",
"for name, pipeline in pipelines_class.items():\n",
"    grid_search = GridSearchCV(pipeline, param_grids_class[name], cv=5, scoring='accuracy')\n",
"    grid_search.fit(X_train_class, y_train_class)\n",
"    best_models_class[name] = grid_search.best_estimator_\n",
"    print(f'Best parameters for {name}: {grid_search.best_params_}')\n",
"\n",
"# Evaluate every tuned model on the held-out test set.\n",
"# zero_division=0 reports 0.0 for classes that were never predicted (the value that was\n",
"# already being substituted) without emitting UndefinedMetricWarning.\n",
"for name, model in best_models_class.items():\n",
"    y_pred_class = model.predict(X_test_class)\n",
"    accuracy = accuracy_score(y_test_class, y_pred_class)\n",
"    report = classification_report(y_test_class, y_pred_class, zero_division=0)\n",
"    print(f'{name}: Accuracy = {accuracy}')\n",
"    print(f'Classification Report:\\n{report}')"
]
}
],