AIM-PIbd-31-Kozyrev-S-S/lab_4/lab_4.ipynb
2024-11-22 16:07:46 +04:00

501 lines
22 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Вариант: Список людей. "
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 10000 entries, 0 to 9999\n",
"Data columns (total 10 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Id 10000 non-null object \n",
" 1 Name 10000 non-null object \n",
" 2 Short description 9996 non-null object \n",
" 3 Gender 9927 non-null object \n",
" 4 Country 9721 non-null object \n",
" 5 Occupation 9836 non-null object \n",
" 6 Birth year 10000 non-null int64 \n",
" 7 Death year 9999 non-null float64\n",
" 8 Manner of death 1893 non-null object \n",
" 9 Age of death 9999 non-null float64\n",
"dtypes: float64(2), int64(1), object(7)\n",
"memory usage: 781.4+ KB\n"
]
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier\n",
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
"from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score\n",
"import numpy as np\n",
"import featuretools as ft\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"\n",
"# Функция для применения oversampling\n",
"def apply_oversampling(X, y):\n",
" oversampler = RandomOverSampler(random_state=42)\n",
" X_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
" return X_resampled, y_resampled\n",
"\n",
"# Функция для применения undersampling\n",
"def apply_undersampling(X, y):\n",
" undersampler = RandomUnderSampler(random_state=42)\n",
" X_resampled, y_resampled = undersampler.fit_resample(X, y)\n",
" return X_resampled, y_resampled\n",
"\n",
"def split_stratified_into_train_val_test(\n",
" df_input,\n",
" stratify_colname=\"y\",\n",
" frac_train=0.6,\n",
" frac_val=0.15,\n",
" frac_test=0.25,\n",
" random_state=None,\n",
"):\n",
" \"\"\"\n",
" Splits a Pandas dataframe into three subsets (train, val, and test)\n",
" following fractional ratios provided by the user, where each subset is\n",
" stratified by the values in a specific column (that is, each subset has\n",
" the same relative frequency of the values in the column). It performs this\n",
" splitting by running train_test_split() twice.\n",
"\n",
" Parameters\n",
" ----------\n",
" df_input : Pandas dataframe\n",
" Input dataframe to be split.\n",
" stratify_colname : str\n",
" The name of the column that will be used for stratification. Usually\n",
" this column would be for the label.\n",
" frac_train : float\n",
" frac_val : float\n",
" frac_test : float\n",
" The ratios with which the dataframe will be split into train, val, and\n",
" test data. The values should be expressed as float fractions and should\n",
" sum to 1.0.\n",
" random_state : int, None, or RandomStateInstance\n",
" Value to be passed to train_test_split().\n",
"\n",
" Returns\n",
" -------\n",
" df_train, df_val, df_test :\n",
" Dataframes containing the three splits.\n",
" \"\"\"\n",
"\n",
" if frac_train + frac_val + frac_test != 1.0:\n",
" raise ValueError(\n",
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
" % (frac_train, frac_val, frac_test)\n",
" )\n",
"\n",
" if stratify_colname not in df_input.columns:\n",
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
"\n",
" X = df_input # Contains all columns.\n",
" y = df_input[\n",
" [stratify_colname]\n",
" ] # Dataframe of just the column on which to stratify.\n",
"\n",
" # Split original dataframe into train and temp dataframes.\n",
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
" )\n",
"\n",
" # Split the temp dataframe into val and test dataframes.\n",
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
" df_val, df_test, y_val, y_test = train_test_split(\n",
" df_temp,\n",
" y_temp,\n",
" stratify=y_temp,\n",
" test_size=relative_frac_test,\n",
" random_state=random_state,\n",
" )\n",
"\n",
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"\n",
" return df_train, df_val, df_test\n",
"\n",
"\n",
"df = pd.read_csv(\"../data/age.csv\", nrows=10000)\n",
"df.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Как бизнес-цели выделим следующие 2 варианта:\n",
" 1) GameDev. Создание игры про конкретного персонажа, живущего в конкретном временном промежутке в конкретной стране. \n",
" 2) Исследование зависимости длительности жизни от страны проживания.\n",
" \n",
"Поскольку именно эти бизнес-цели были выбраны в предыдущей лабораторной работе, будем их использовать.\n",
"Но возникает проблема с 1 целью: её невозможно использовать для задачи классификации. Заменим ее на классификацию людей по возрастным группам, что может быть полезно для рекламных целей."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выполним подготовку данных"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"df.fillna({\"Gender\": \"NaN\", \"Country\": \"NaN\", \"Occupation\" : \"NaN\", \"Manner of death\" : \"NaN\"}, inplace=True)\n",
"df = df.dropna()\n",
"df['Country'] = df['Country'].str.split('; ')\n",
"df = df.explode('Country')\n",
"data = df.copy()\n",
"\n",
"value_counts = data[\"Country\"].value_counts()\n",
"rare = value_counts[value_counts < 100].index\n",
"data = data[~data[\"Country\"].isin(rare)]\n",
"\n",
"data.drop(data[~data['Gender'].isin(['Male', 'Female'])].index, inplace=True)\n",
"\n",
"data1 = pd.get_dummies(data, columns=['Gender', 'Country', 'Occupation'], drop_first=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Определить достижимый уровень качества модели для каждой задачи. На основе имеющихся данных уровень качества моделей не будет высоким, поскольку все таки длительность жизни лишь примерная и точно ее угадать невозможно."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выберем ориентиры для наших 2х задач:\n",
" 1)Регрессии - средний возраст человека\n",
" 2)Классификации - аиболее часто встречающаяся возрастная группа"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Построим конвейер."
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Id', 'Name', 'Short description', 'Birth year', 'Death year',\n",
" 'Age of death', 'Gender_Male', 'Country_France',\n",
" 'Country_German Confederation', 'Country_German Democratic Republic',\n",
" ...\n",
" 'Manner of death_euthanasia', 'Manner of death_homicide',\n",
" 'Manner of death_homicide; natural causes',\n",
" 'Manner of death_internal bleeding', 'Manner of death_natural causes',\n",
" 'Manner of death_suicide',\n",
" 'Manner of death_suicide; homicide; accident',\n",
" 'Manner of death_suicide; unfortunate accident',\n",
" 'Manner of death_summary execution', 'Manner of death_unnatural death'],\n",
" dtype='object', length=400)\n"
]
}
],
"source": [
"print(data.columns)\n"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters for Linear Regression: {}\n",
"Best parameters for Random Forest Regressor: {'model__max_depth': None, 'model__n_estimators': 100}\n",
"Best parameters for Gradient Boosting Regressor: {'model__learning_rate': 0.2, 'model__max_depth': 7, 'model__n_estimators': 300}\n",
"Linear Regression: MSE = 0.002807184047660083, R2 = 0.9999899555289343\n",
"Random Forest Regressor: MSE = 11.46917740409879, R2 = 0.9589617856804076\n",
"Gradient Boosting Regressor: MSE = 8.202651735797296, R2 = 0.9706498410424512\n"
]
}
],
"source": [
"X_reg = data1.drop(['Id', 'Name', 'Age of death', 'Short description', 'Manner of death'], axis=1)\n",
"y_reg = data1['Age of death']\n",
"\n",
"# Разделение данных\n",
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
"\n",
"# Выбор моделей для регрессии\n",
"models_reg = {\n",
" 'Linear Regression': LinearRegression(),\n",
" 'Random Forest Regressor': RandomForestRegressor(random_state=42),\n",
" 'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42)\n",
"}\n",
"\n",
"# Создание конвейера для регрессии\n",
"pipelines_reg = {}\n",
"for name, model in models_reg.items():\n",
" pipelines_reg[name] = Pipeline([\n",
" ('scaler', StandardScaler()),\n",
" ('model', model)\n",
" ])\n",
"\n",
"# Определение сетки гиперпараметров для регрессии\n",
"param_grids_reg = {\n",
" 'Linear Regression': {},\n",
" 'Random Forest Regressor': {\n",
" 'model__n_estimators': [100, 200, 300],\n",
" 'model__max_depth': [None, 10, 20, 30]\n",
" },\n",
" 'Gradient Boosting Regressor': {\n",
" 'model__n_estimators': [100, 200, 300],\n",
" 'model__learning_rate': [0.01, 0.1, 0.2],\n",
" 'model__max_depth': [3, 5, 7]\n",
" }\n",
"}\n",
"\n",
"# Настройка гиперпараметров для регрессии\n",
"best_models_reg = {}\n",
"for name, pipeline in pipelines_reg.items():\n",
" grid_search = GridSearchCV(pipeline, param_grids_reg[name], cv=5, scoring='neg_mean_squared_error')\n",
" grid_search.fit(X_train_reg, y_train_reg)\n",
" best_models_reg[name] = grid_search.best_estimator_\n",
" print(f'Best parameters for {name}: {grid_search.best_params_}')\n",
"\n",
"# Обучение моделей для регрессии\n",
"for name, model in best_models_reg.items():\n",
" model.fit(X_train_reg, y_train_reg)\n",
"\n",
"# Оценка качества моделей для регрессии\n",
"for name, model in best_models_reg.items():\n",
" y_pred_reg = model.predict(X_test_reg)\n",
" mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
" r2 = r2_score(y_test_reg, y_pred_reg)\n",
" print(f'{name}: MSE = {mse}, R2 = {r2}')"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"data2 = data.drop(['Short description', 'Manner of death', 'Gender', 'Country', 'Occupation'], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Birth year', 'Death year'], dtype='object')\n",
"Best parameters for Logistic Regression: {'model__C': 10, 'model__solver': 'lbfgs'}\n",
"Best parameters for Random Forest Classifier: {'model__max_depth': 30, 'model__n_estimators': 200}\n",
"Best parameters for Gradient Boosting Classifier: {'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__n_estimators': 200}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Logistic Regression: Accuracy = 0.9248554913294798\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 0-18 0.00 0.00 0.00 5\n",
" 19-30 0.50 0.08 0.14 60\n",
" 31-50 0.77 0.77 0.77 242\n",
" 51-70 0.91 0.96 0.94 650\n",
" 71+ 0.98 1.00 0.99 946\n",
"\n",
" accuracy 0.92 1903\n",
" macro avg 0.63 0.56 0.57 1903\n",
"weighted avg 0.91 0.92 0.91 1903\n",
"\n",
"Random Forest Classifier: Accuracy = 0.9485023646873357\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 0-18 0.67 0.40 0.50 5\n",
" 19-30 0.96 0.77 0.85 60\n",
" 31-50 0.88 0.89 0.88 242\n",
" 51-70 0.92 0.95 0.94 650\n",
" 71+ 0.99 0.97 0.98 946\n",
"\n",
" accuracy 0.95 1903\n",
" macro avg 0.88 0.80 0.83 1903\n",
"weighted avg 0.95 0.95 0.95 1903\n",
"\n",
"Gradient Boosting Classifier: Accuracy = 0.9379926431949553\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 0-18 1.00 0.40 0.57 5\n",
" 19-30 0.96 0.77 0.85 60\n",
" 31-50 0.87 0.87 0.87 242\n",
" 51-70 0.90 0.95 0.92 650\n",
" 71+ 0.98 0.96 0.97 946\n",
"\n",
" accuracy 0.94 1903\n",
" macro avg 0.94 0.79 0.84 1903\n",
"weighted avg 0.94 0.94 0.94 1903\n",
"\n"
]
}
],
"source": [
"# Создание возрастных групп\n",
"bins = [0, 18, 30, 50, 70, 100]\n",
"labels = ['0-18', '19-30', '31-50', '51-70', '71+']\n",
"data['Age Group'] = pd.cut(data['Age of death'], bins=bins, labels=labels)\n",
"\n",
"# Выбор признаков и целевой переменной для классификации\n",
"X_class = data2.drop(['Id', 'Name', 'Age of death', 'Age Group'], axis=1)\n",
"y_class = data['Age Group'] \n",
"print(X_class.columns)\n",
"# Разделение данных\n",
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n",
"\n",
"# Выбор моделей для классификации\n",
"models_class = {\n",
" 'Logistic Regression': LogisticRegression(random_state=42, max_iter=5000, solver='liblinear'),\n",
" 'Random Forest Classifier': RandomForestClassifier(random_state=42),\n",
" 'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=42)\n",
"}\n",
"\n",
"# Создание конвейера для классификации\n",
"pipelines_class = {}\n",
"for name, model in models_class.items():\n",
" pipelines_class[name] = Pipeline([\n",
" ('scaler', StandardScaler()),\n",
" ('model', model)\n",
" ])\n",
"\n",
"# Определение сетки гиперпараметров для классификации\n",
"'''\n",
"param_grids_class = {\n",
" 'Logistic Regression': {\n",
" 'model__C': [0.1, 1, 10],\n",
" 'model__solver': ['lbfgs', 'liblinear']\n",
" },\n",
" 'Random Forest Classifier': {\n",
" 'model__n_estimators': [100, 200, 300],\n",
" 'model__max_depth': [None, 10, 20, 30]\n",
" },\n",
" 'Gradient Boosting Classifier': {\n",
" 'model__n_estimators': [100, 200, 300],\n",
" 'model__learning_rate': [0.01, 0.1, 0.2],\n",
" 'model__max_depth': [3, 5, 7]\n",
" }\n",
"}'''\n",
"# Убрал определение параметров поскольку уже был предподсчет данных, но вылетела ошибка. Сохранил лучшие параметры\n",
"\n",
"param_grids_class = {\n",
" 'Logistic Regression': {\n",
" 'model__C': [10],\n",
" 'model__solver': ['lbfgs']\n",
" },\n",
" 'Random Forest Classifier': {\n",
" 'model__n_estimators': [200],\n",
" 'model__max_depth': [ 30]\n",
" },\n",
" 'Gradient Boosting Classifier': {\n",
" 'model__n_estimators': [200],\n",
" 'model__learning_rate': [0.1],\n",
" 'model__max_depth': [7]\n",
" }\n",
"}\n",
"\n",
"# Настройка гиперпараметров для классификации\n",
"best_models_class = {}\n",
"for name, pipeline in pipelines_class.items():\n",
" grid_search = GridSearchCV(pipeline, param_grids_class[name], cv=5, scoring='accuracy')\n",
" grid_search.fit(X_train_class, y_train_class)\n",
" best_models_class[name] = grid_search.best_estimator_\n",
" print(f'Best parameters for {name}: {grid_search.best_params_}')\n",
"\n",
"# Обучение моделей для классификации\n",
"for name, model in best_models_class.items():\n",
" model.fit(X_train_class, y_train_class)\n",
"\n",
"# Оценка качества моделей для классификации\n",
"for name, model in best_models_class.items():\n",
" y_pred_class = model.predict(X_test_class)\n",
" accuracy = accuracy_score(y_test_class, y_pred_class)\n",
" report = classification_report(y_test_class, y_pred_class)\n",
" print(f'{name}: Accuracy = {accuracy}')\n",
" print(f'Classification Report:\\n{report}')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "aimvenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}